In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import os
os.chdir('/Users/steve/GetOldTweets3-0.0.10')
import re
import nltk
import contractions

os.environ['KMP_DUPLICATE_LIB_OK']='True'

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

from sent2vec.vectorizer import Vectorizer

from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

!git clone https://github.com/facebookresearch/fastText.git
!cd fastText
!python3 setup.py install
import fasttext.util

from imblearn.combine import SMOTETomek

from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, strip_tags, strip_punctuation, stem_text, preprocess_documents, strip_multiple_whitespaces, strip_non_alphanum, strip_short

fatal: destination path 'fastText' already exists and is not an empty directory.
python3: can't open file 'setup.py': [Errno 2] No such file or directory


In [21]:
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /Users/steve/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [22]:
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_non_alphanum, stem_text,
                  remove_stopwords, strip_short]

def prep(sentence): 
    sentence = contractions.fix(sentence)
    split_sentence = sentence.split()
    return preprocess_string(sentence, CUSTOM_FILTERS)

def prep_withspell(sentence):
    holder = " "
    sentence = contractions.fix(sentence)
    preprocessed = preprocess_string(sentence, CUSTOM_FILTERS)
    spell_correct = TextBlob(holder.join(preprocessed))
    corrected_sentence = spell_correct.correct()
    return corrected_sentence.split()    

def prepare(sentence): 
    processed_feature = re.sub(r'\W', ' ', str(sentence))
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    processed_feature = processed_feature.lower()
    stop_words = set(stopwords.words('english'))
    add = ['football', 'league', 'soccer', 'super', 'super league', 'footbal', 'leagu', 'footbal soccer']
    stop_words = set.union(stop_words, add)
    sentence = processed_feature.split()
    return [w for w in sentence if not w.lower() in stop_words]    

In [23]:
def cont_to_multiclass(cont):
    if cont > 0.66:
        return 'highly positive'
    elif cont > 0.33:
        return 'positive'
    elif cont > 0:
        return 'partly positive'
    elif cont > -0.33:
        return 'partly negative'
    elif cont > -0.66:
        return 'negative'
    else:
        return 'highly negative'

def cont_to_binary(cont):
    if cont > 0:
        #return 'positive'
        return 1
    else:
        #return 'negative'
        return 0

In [24]:
def get_document_frequency(data, wi, wj=None):
    if wj is None:
        D_wi = 0
        for l in range(len(data)):
            doc = data[l]
            if wi in doc:
                D_wi += 1
        return D_wi
    D_wj = 0
    D_wi_wj = 0
    for l in range(len(data)):
        doc = data[l]
        if wj in doc:
            D_wj += 1
            if wi in doc:
                D_wi_wj += 1
    return D_wj, D_wi_wj

def get_topic_coherence(beta, data, vocab, seed):
    D = len(data)
    TC = []
    num_topics = len(beta.components_)
    selected = -1
    selected = []
    for k, topic in enumerate(beta.components_):
        print('k: {}/{}'.format(k, num_topics))
        top_10 = topic.argsort()[:-20 - 1:-1]
        top_words = [vocab[i] for i in top_10]
        print(top_words)
        TC_k = 0
        counter = 0
        for i, word in enumerate(top_words):
            D_wi = get_document_frequency(data, word)
            j = i + 1
            tmp = 0
            while j < len(top_10) and j > i:
                D_wj, D_wi_wj = get_document_frequency(data, word, top_words[j])
                if D_wi_wj == 0:
                    f_wi_wj = -1
                else:
                    f_wi_wj = -1 + ( np.log(D_wi) + np.log(D_wj)  - 2.0 * np.log(D) ) / ( np.log(D_wi_wj) - np.log(D) )
                tmp += f_wi_wj
                j += 1
                counter += 1
            TC_k += tmp 
        TC.append(TC_k)
    print('num topics: ', len(TC))
    print('Topic Coherence is: {}'.format(TC))
    return TC, selected

In [25]:
def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
    sentiment_dict = sid_obj.polarity_scores(sentence)
    return sentiment_dict['compound']

In [26]:
def word_vector(model, tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += model.wv[word].reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

In [40]:
sl_df = pd.read_csv('SL.csv', usecols = ['content'])
sl_df = sl_df.rename(columns={"content": 0})
dataset = sl_df.drop_duplicates()

In [53]:
dataset[1] = dataset.apply(lambda row: re.sub(r'http\S+', '', str(row[0])), axis=1)
dataset[2] = dataset.apply(lambda row: ''.join([c for c in row[1] if not c.isdigit()]), axis=1)
dataset[3] = dataset.apply(lambda row : " ".join(w for w in nltk.wordpunct_tokenize(row[2]) if w.lower() in words or not w.isalpha()), axis = 1)
dataset[4] = dataset.apply(lambda row : prep(row[3]), axis = 1)
dataset[5] = dataset.apply(lambda row : prepare(row[3]), axis = 1)

processed_data = [" ".join(x) for x in dataset[4]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[1] = dataset.apply(lambda row: re.sub(r'http\S+', '', str(row[0])), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[2] = dataset.apply(lambda row: ''.join([c for c in row[1] if not c.isdigit()]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[3] = dataset.appl

In [58]:
tfidf = TfidfVectorizer(max_df=0.90, min_df=50, stop_words='english', ngram_range=(1,3))
dtm = tfidf.fit_transform(processed_data)

nmf_model = NMF(n_components=50, random_state=42, beta_loss='kullback-leibler', solver='mu', 
                max_iter=1000, alpha=2, l1_ratio=0.5)
nmf_topics = nmf_model.fit_transform(dtm)

for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 50 WORDS FOR TOPIC #{index} GIVEN BY NMF:')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-50:]])
    print('\n')



THE TOP 50 WORDS FOR TOPIC #0 GIVEN BY NMF:
['banner', 'nak', 'global capit larri', 'capit larri', 'apa', 'prison', 'goalkeep', 'expos', 'kalo', 'super leagu finish', 'stuck', 'larri', 'leagu finish', 'smoke', 'cancel', 'tax', 'fpl', 'metaphor global capit', 'global', 'leagu perfect metaphor', 'perfect metaphor global', 'metaphor global', 'chri', 'perfect metaphor', 'bluster super leagu', 'bluster super', 'global capit', 'readi super leagu', 'readi super', 'bluster', 'super leagu curs', 'leagu curs', 'till', 'super leagu tax', 'leagu tax', 'metaphor', 'capit', 'super leagu perfect', 'super leagu materi', 'leagu materi', 'leagu perfect', 'perfect', 'materi', 'main super leagu', 'main super', 'curs', 'main', 'super leagu', 'super', 'leagu']


THE TOP 50 WORDS FOR TOPIC #1 GIVEN BY NMF:
['sinc', 'mani', 'massiv', 'went', 'footbal', 'abl', 'public', 'total', 'onc', 'corrupt', 'given', 'lot', 'probabl', 'govern', 'power', 'noth', 'said', 'anyth', 'chang', 'someth', 'import', 'understand', '

In [59]:
tf_vectorizer = CountVectorizer(analyzer='word', min_df=50, ngram_range=(1, 3))
tf_fit = tf_vectorizer.fit_transform(processed_data)

lda_model = LatentDirichletAllocation(n_components=50, doc_topic_prior=.01)
lda_topics = lda_model.fit_transform(tf_fit)

for index, topic in enumerate(lda_model.components_):
    print(f'THE TOP 50 WORDS FOR TOPIC #{index} GIVEN BY LDA:')
    print([tf_vectorizer.get_feature_names()[i] for i in topic.argsort()[-50:]])
    print('\n')

THE TOP 50 WORDS FOR TOPIC #0 GIVEN BY LDA:
['approv', 'multipl', 'formal', 'seek', 'super leagu chang', 'reviv', 'paul tenorio', 'tenorio', 'eat', 'sport', 'leagu chang', 'competit super leagu', 'competit super', 'realli super leagu', 'premier', 'super leagu follow', 'premier leagu', 'paul', 'inter', 'confirm', 'particip super leagu', 'realli super', 'particip super', 'leagu follow', 'offici', 'announc', 'leagu withdraw super', 'mail', 'leagu withdraw competit', 'daili', 'super leagu competit', 'origin', 'withdraw competit', 'kind', 'leagu competit', 'super leagu realli', 'leagu realli', 'super leagu withdraw', 'particip', 'premier leagu withdraw', 'follow', 'realli', 'leagu withdraw', 'withdraw super leagu', 'withdraw super', 'competit', 'super leagu', 'super', 'withdraw', 'leagu']


THE TOP 50 WORDS FOR TOPIC #1 GIVEN BY LDA:
['someth like', 'thi like', 'imposs', 'everi club', 'act like', 'tax', 'end', 'feel like', 'noth super leagu', 'game', 'super leagu everi', 'noth super', 'supe

In [60]:
naming={0:'Topic0', 1:'Topic1', 2:'Topic2', 3:'Topic3', 4:'Topic4', 5:'Topic5', 6:'Topic6', 7:'Topic7',
        8:'Topic8', 9:'Topic9', 10:'Topic10', 11:'Topic11', 12:'Topic12', 13:'Topic13', 14:'Topic14', 15:'Topic15',
        16:'Topic16', 17:'Topic17', 18:'Topic18', 19:'Topic19', 20:'Topic20', 21:'Topic21', 22:'Topic22', 23:'Topic23',
        24:'Topic24', 25:'Topic25', 26:'Topic26', 27:'Topic27', 28:'Topic28', 29:'Topic29', 30:'Topic30', 31:'Topic31',
        32:'Topic32', 33:'Topic33', 34:'Topic34', 35:'Topic35', 36:'Topic36', 37:'Topic37', 38:'Topic38', 39:'Topic39',
        40:'Topic40', 41:'Topic41', 42:'Topic42', 43:'Topic43', 44:'Topic44', 45:'Topic45', 46:'Topic46', 47:'Topic47',
        48:'Topic48', 49:'Topic49'}


dataset[5] = nmf_topics.argmax(axis=1)
dataset[5] = dataset[5].map(naming)
dataset[6] = lda_topics.argmax(axis=1)
dataset[6] = dataset[6].map(naming)
dataset[7] = dataset[4].apply(lambda row: ' '.join(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[5] = nmf_topics.argmax(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[5] = dataset[5].map(naming)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[6] = lda_topics.argmax(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [61]:
lab_num = 20000
labeled_sentiment = dataset.sample(n=lab_num)
labeled_sentiment[9] = labeled_sentiment.apply(lambda row : sentiment_scores(row[2]), axis = 1)
labeled_sentiment[10] = labeled_sentiment[9].apply(lambda row: cont_to_binary(row))
dataset = dataset.join(labeled_sentiment[10], how='left', lsuffix='_left', rsuffix='_right')

In [65]:
%%capture
fasttext.util.download_model('en', if_exists='ignore')
ft_model = fasttext.load_model('cc.en.300.bin')
ft = pd.DataFrame(processed_data, columns=['tweets'])
ft['tweets'] = ft['tweets'].apply(lambda row: ft_model.get_sentence_vector(row))
ft = np.stack(ft['tweets'].to_numpy())

OSError: [Errno 28] No space left on device

In [66]:
dtm_ar = dtm.toarray()
tf_fit_ar = tf_fit.toarray()
embed_array = np.hstack((dtm_ar, tf_fit_ar, ft))
embed_array_df = pd.DataFrame(embed_array, index=dataset.index)
labeled_sentiment_embeddings = labeled_sentiment.join(embed_array_df, how='left', lsuffix='_left', rsuffix='_right')
labeled_sentiment_embeddings.drop(['0_left', '1_left', '2_left', '3_left', '4_left', '5_left'], axis=1, inplace=True)
labeled_sentiment_embeddings.drop(['6_left','7_left', '8_left', '9_left'], axis=1, inplace=True)
print('here')

smt = SMOTETomek(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(labeled_sentiment_embeddings, labeled_sentiment[9], test_size=0.1, random_state=42)
X_train, y_train = smt.fit_resample(X_train, y_train)

NameError: name 'ft' is not defined

In [None]:
X_train_dtm = X_train[X_train.columns[0:dtm.shape[1]]]
X_train_tf = X_train[X_train.columns[dtm.shape[1]:(dtm.shape[1]+tf_fit.shape[1])]]
X_train_tf = X_train[X_train.columns[dtm.shape[1]:(dtm.shape[1]+tf_fit.shape[1])]]
X_train_ft = X_train[X_train.columns[(dtm.shape[1]+tf_fit.shape[1]):(dtm.shape[1]+tf_fit.shape[1]+ft.shape[1])]]
X_test_dtm = X_test[X_test.columns[0:dtm.shape[1]]]
X_test_tf = X_test[X_test.columns[dtm.shape[1]:(dtm.shape[1]+tf_fit.shape[1])]]
X_test_ft = X_test[X_test.columns[(dtm.shape[1]+tf_fit.shape[1]):(dtm.shape[1]+tf_fit.shape[1]+ft.shape[1])]]
dtm_df = pd.DataFrame(dtm_ar, columns=X_test_dtm.columns)
tf_fit_df = pd.DataFrame(tf_fit_ar, columns=X_test_tf.columns)
ft_df = pd.DataFrame(ft, columns=X_test_ft.columns)

In [None]:
lab_num=20000
lab_num=int(lab_num/10)
rf_pred = pd.DataFrame(np.empty((lab_num, 3), columns=['dtm', 'tf_fit', 'fasttext'])
etc_pred = pd.DataFrame(np.empty((lab_num, 3)), columns=['dtm', 'tf_fit', 'fasttext'])
sgd_pred = pd.DataFrame(np.empty((lab_num, 3)), columns=['dtm', 'tf_fit', 'fasttext'])

rf_pred_full = pd.DataFrame(np.empty((len(dataset[0]), 3)), columns=['dtm', 'tf_fit', 'fasttext'])
etc_pred_full = pd.DataFrame(np.empty((len(dataset[0]), 3)), columns=['dtm', 'tf_fit', 'fasttext'])
sgd_pred_full = pd.DataFrame(np.empty((len(dataset[0]), 3)), columns=['dtm', 'tf_fit', 'fasttext'])

In [None]:
print("RandomForest: ")
rf_classifier_dtm = RandomForestClassifier(n_estimators=300, max_depth=300, random_state=42)
rf_classifier_dtm.fit(X_train_dtm, y_train)
predictions = rf_classifier_dtm.predict(X_test_dtm)
rf_pred['dtm'] = predictions
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))
predictions = rf_classifier_dtm.predict(dtm_df)
rf_pred_full['dtm'] = predictions

print("ExtraTree: ")
etc_dtm = ExtraTreesClassifier(n_estimators=300, random_state=42)
etc_dtm.fit(X_train_dtm, y_train)
predictions = etc_dtm.predict(X_test_dtm)
etc_pred['dtm'] = predictions
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
predictions = etc_dtm.predict(dtm_df)
etc_pred_full['dtm'] = predictions

print("StochasticGradientDescent: ")
sgd_classifier_dtm = SGDClassifier(loss="hinge", penalty="l1")
sgd_classifier_dtm.fit(X_train_dtm, y_train)
predictions = sgd_classifier_dtm.predict(X_test_dtm)
sgd_pred['dtm'] = predictions
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
predictions = sgd_classifier_dtm.predict(dtm_df)
sgd_pred_full['dtm'] = predictions


In [None]:
print("RandomForest: ")
rf_classifier_tf = RandomForestClassifier(n_estimators=300, max_depth=300, random_state=42)
rf_classifier_tf.fit(X_train_tf, y_train)
predictions = rf_classifier_tf.predict(X_test_tf)
rf_pred['tf_fit'] = predictions
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
predictions = rf_classifier_tf.predict(tf_fit)
rf_pred_full['tf_fit'] = predictions

print("ExtraTree: ")
etc_tf = ExtraTreesClassifier(n_estimators=300, random_state=42)
etc_tf.fit(X_train_tf, y_train)
predictions = etc_tf.predict(X_test_tf)
etc_pred['tf_fit'] = predictions
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
predictions = etc_tf.predict(tf_fit)
etc_pred_full['tf_fit'] = predictions

print("StochasticGradientDescent: ")
sgd_classifier_tf = SGDClassifier(loss="hinge", penalty="l1")
sgd_classifier_tf.fit(X_train_tf, y_train)
predictions = sgd_classifier_tf.predict(X_test_tf)
sgd_pred['tf_fit'] = predictions
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
predictions = sgd_classifier_tf.predict(tf_fit)
sgd_pred_full['tf_fit'] = predictions


In [None]:
print("RandomForest: ")
rf_classifier_ft = RandomForestClassifier(n_estimators=300, max_depth=300, random_state=42)
rf_classifier_ft.fit(X_train_ft, y_train)
predictions = rf_classifier_ft.predict(X_test_ft)
rf_pred['fasttext'] = predictions
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
predictions = rf_classifier_ft.predict(ft)
rf_pred_full['fasttext'] = predictions

print("ExtraTree: ")
etc_ft = ExtraTreesClassifier(n_estimators=300, random_state=42)
etc_ft.fit(X_train_ft, y_train)
predictions = etc_ft.predict(X_test_ft)
etc_pred['fasttext'] = predictions
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
predictions = etc_ft.predict(ft)
etc_pred_full['fasttext'] = predictions

print("StochasticGradientDescent: ")
sgd_classifier_ft = SGDClassifier(loss="hinge", penalty="l1")
sgd_classifier_ft.fit(X_train_ft, y_train)
predictions = sgd_classifier_ft.predict(X_test_ft)
sgd_pred['fasttext'] = predictions
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))
predictions = sgd_classifier_ft.predict(ft)
sgd_pred_full['fasttext'] = predictions


In [None]:
print("RF Model: ")
rf_model = LogisticRegression(random_state=42).fit(rf_pred, y_test)
predictions = rf_model.predict(rf_pred)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

print("ETC Model: ")
etc_model = LogisticRegression(random_state=42).fit(etc_pred, y_test)
predictions = etc_model.predict(etc_pred)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

print("SGD Model: ")
sgd_model = LogisticRegression(random_state=42).fit(sgd_pred, y_test)
predictions = sgd_model.predict(sgd_pred)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))


In [None]:
dataset[10] = rf_model.predict(rf_pred_full)
dataset[11] = etc_model.predict(etc_pred_full)
dataset[12] = sgd_model.predict(sgd_pred_full)
dataset[13] = xgb_model.predict(xgb_pred_full)

sent_pred = dataset[9]
ml_pred = dataset[11]
sent_pred[np.isnan(sent_pred)] = ml_pred
dataset[14] = sent_pred.to_frame()

In [None]:
race_nmf, race_lda = 'Topic28', 'Topic3'
gun_nmf, gun_lda = 'Topic31', 'Topic50'
mask_nmf, mask_lda = 'Topic27', 'Topic5'
resist_nmf, resist_lda = 'Topic25', 'Topic42'
immig_nmf, immig_lda = 'Topic45', 'Topic44'
race_nmf = dataset[dataset[5] == race_nmf]
gun_nmf = dataset[dataset[5] == gun_nmf]
mask_nmf = dataset[dataset[5] == mask_nmf]
resist_nmf = dataset[dataset[5] == resist_nmf]
immig_nmf = dataset[dataset[5] == immig_nmf]

from collections import Counter
race_nmf_vol = pd.DataFrame.from_dict(Counter([date[0:7] for date in (race_nmf[0].values)]), orient='index').reset_index()
gun_nmf_vol = pd.DataFrame.from_dict(Counter([date[0:7] for date in (gun_nmf[0].values)]), orient='index').reset_index()
mask_nmf_vol = pd.DataFrame.from_dict(Counter([date[0:7] for date in (mask_nmf[0].values)]), orient='index').reset_index()
resist_nmf_vol = pd.DataFrame.from_dict(Counter([date[0:7] for date in (resist_nmf[0].values)]), orient='index').reset_index()
immig_nmf_vol = pd.DataFrame.from_dict(Counter([date[0:7] for date in (immig_nmf[0].values)]), orient='index').reset_index()

In [None]:
import datetime
race_nmf_vol = race_nmf_vol.sort_values(by=['index'])
race_nmf_vol['index'] = [datetime.datetime.strptime(d,"%Y-%m").date() for d in race_nmf_vol['index']]
plt.plot(race_nmf_vol['index'], race_nmf_vol[0])
plt.ylabel('# Posts per Month')
plt.title('Race NMF Volumetric Analysis')
#plt.locator_params(axis="x", nbins=4)
plt.show()

In [None]:
gun_nmf_vol = gun_nmf_vol.sort_values(by=['index'])
gun_nmf_vol['index'] = [datetime.datetime.strptime(d,"%Y-%m").date() for d in gun_nmf_vol['index']]
plt.plot(gun_nmf_vol['index'], gun_nmf_vol[0])
plt.ylabel('# Posts per Month')
plt.title('Gun NMF Volumetric Analysis')
plt.show()

In [None]:
mask_nmf_vol = mask_nmf_vol.sort_values(by=['index'])
mask_nmf_vol['index'] = [datetime.datetime.strptime(d,"%Y-%m").date() for d in mask_nmf_vol['index']]
plt.plot(mask_nmf_vol['index'], mask_nmf_vol[0])
plt.ylabel('# Posts per Month')
plt.title('COVID NMF Volumetric Analysis')
#plt.locator_params(axis="x", nbins=4)
plt.show()

In [None]:
resist_nmf_vol = resist_nmf_vol.sort_values(by=['index'])
resist_nmf_vol['index'] = [datetime.datetime.strptime(d,"%Y-%m").date() for d in resist_nmf_vol['index']]
plt.plot(resist_nmf_vol['index'], resist_nmf_vol[0])
plt.ylabel('# Posts per Month')
plt.title('Resist NMF Volumetric Analysis')
#plt.locator_params(axis="x", nbins=4)
plt.show()

In [None]:
immig_nmf_vol = immig_nmf_vol.sort_values(by=['index'])
immig_nmf_vol['index'] = [datetime.datetime.strptime(d,"%Y-%m").date() for d in immig_nmf_vol['index']]
plt.plot(immig_nmf_vol['index'], immig_nmf_vol[0])
plt.ylabel('# Posts per Month')
plt.title('Immigration NMF Volumetric Analysis')
plt.locator_params(axis="x", nbins=4)
plt.show()

In [None]:
race_sentiment = pd.DataFrame([date for date in (race_nmf[14].values)], index=(race_nmf[0].values))
race_sentiment_pos = race_sentiment.groupby(race_sentiment[0]).get_group(1)
race_sentiment_neg = race_sentiment.groupby(race_sentiment[0]).get_group(0)
race_sentiment_pos = (pd.DataFrame.from_dict(Counter(race_sentiment_pos.index.str[:7]), orient='index').reset_index()).sort_values(by=['index'])
race_sentiment_neg = (pd.DataFrame.from_dict(Counter(race_sentiment_neg.index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')

ax = plt.gca()

race_sentiment_pos.plot(style=[':', '--', '-'], color='blue', ax=ax)
race_sentiment_neg.plot(style=[':', '--', '-'], color='red', ax=ax)
plt.ylabel('# Posts per Month')
plt.xlabel('Time-Series')
plt.xticks(np.arange(0, max(len(race_sentiment_pos), len(race_sentiment_neg))+1, 10))
plt.title('Race NMF Sentiment Analysis')
ax.xaxis.set_major_locator(plt.MaxNLocator(10))
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
gun_sentiment = pd.DataFrame([date for date in (gun_nmf[14].values)], index=(gun_nmf[0].values))
gun_sentiment_pos = gun_sentiment.groupby(gun_sentiment[0]).get_group(1)
gun_sentiment_neg = gun_sentiment.groupby(gun_sentiment[0]).get_group(0)
gun_sentiment_pos = (pd.DataFrame.from_dict(Counter(gun_sentiment_pos[0].index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')
gun_sentiment_neg = (pd.DataFrame.from_dict(Counter(gun_sentiment_neg[0].index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')

ax = plt.gca()

gun_sentiment_pos.plot(style=[':', '--', '-'], color='blue', ax=ax)
gun_sentiment_neg.plot(style=[':', '--', '-'], color='red', ax=ax)
plt.ylabel('# Gun per Month')
plt.xlabel('Time-Series')
plt.title('Immigration NMF Sentiment Analysis')
ax.xaxis.set_major_locator(plt.MaxNLocator(10))
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
mask_sentiment = pd.DataFrame([date for date in (mask_nmf[14].values)], index=(mask_nmf[0].values))
mask_sentiment_pos = mask_sentiment.groupby(mask_sentiment[0]).get_group(1)
mask_sentiment_neg = mask_sentiment.groupby(mask_sentiment[0]).get_group(0)
mask_sentiment_pos = (pd.DataFrame.from_dict(Counter(mask_sentiment_pos[0].index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')
mask_sentiment_neg = (pd.DataFrame.from_dict(Counter(mask_sentiment_neg[0].index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')

ax = plt.gca()

mask_sentiment_pos.plot(style=[':', '--', '-'], color='blue', ax=ax)
mask_sentiment_neg.plot(style=[':', '--', '-'], color='red', ax=ax)
plt.ylabel('# Posts per Month')
plt.xlabel('Time-Series')
plt.title('Healthcare NMF Sentiment Analysis')
ax.xaxis.set_major_locator(plt.MaxNLocator(10))
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
resist_sentiment = pd.DataFrame([date for date in (resist_nmf[14].values)], index=(resist_nmf[0].values))
resist_sentiment_pos = resist_sentiment.groupby(resist_sentiment[0]).get_group(1)
resist_sentiment_neg = resist_sentiment.groupby(resist_sentiment[0]).get_group(0)
resist_sentiment_pos = (pd.DataFrame.from_dict(Counter(resist_sentiment_pos[0].index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')
resist_sentiment_neg = (pd.DataFrame.from_dict(Counter(resist_sentiment_neg[0].index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')

ax = plt.gca()

resist_sentiment_pos.plot(style=[':', '--', '-'], color='blue', ax=ax)
resist_sentiment_neg.plot(style=[':', '--', '-'], color='red', ax=ax)
plt.ylabel('# Posts per Month')
plt.xlabel('Time-Series')
plt.title('Racism NMF Sentiment Analysis')
ax.xaxis.set_major_locator(plt.MaxNLocator(10))
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
immig_sentiment = pd.DataFrame([date for date in (immig_nmf[14].values)], index=(immig_nmf[0].values))
immig_sentiment_pos = immig_sentiment.groupby(immig_sentiment[0]).get_group(1)
immig_sentiment_neg = immig_sentiment.groupby(immig_sentiment[0]).get_group(0)
immig_sentiment_pos = (pd.DataFrame.from_dict(Counter(immig_sentiment_pos[0].index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')
immig_sentiment_neg = (pd.DataFrame.from_dict(Counter(immig_sentiment_neg[0].index.str[:7]), orient='index').reset_index()).sort_values(by=['index']).set_index('index')

ax = plt.gca()

resist_sentiment_pos.plot(style=[':', '--', '-'], color='blue', ax=ax)
resist_sentiment_neg.plot(style=[':', '--', '-'], color='red', ax=ax)
plt.ylabel('# Posts per Month')
plt.xlabel('Time-Series')
plt.title('Immigration NMF Sentiment Analysis')
ax.xaxis.set_major_locator(plt.MaxNLocator(10))
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

ax.plot(race_sentiment_neg[0].index,
        race_sentiment_neg[0], 
        linestyle=":", 
        color='red')
ax.plot(race_sentiment_pos[0].index,
        race_sentiment_pos[0],
        color='blue',
        linestyle="--")
ax.set(xlabel="Date",
       ylabel="Negative Sentiment",
       title="Daily Total Precipitation\nBoulder, Colorado in July 2018")
plt.gcf().autofmt_xdate()
ax.xaxis.set_major_locator(plt.MaxNLocator(30))
plt.show()

In [None]:
immig_sentiment_pos.index