In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
pd.options.display.width = 500
import nltk
import string
import re
from nltk.stem.porter import *

In [2]:
df_bal = pd.read_csv('labeled_data.csv')
df_bal = df_bal[df_bal['class'] != 2].iloc[:,1:]

In [3]:
df_bal['class'] = df_bal['class'].astype('int')

In [4]:
#Data Preprocessing
import nltk
stopwords = nltk.corpus.stopwords.words('english')
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)
from spacy.lang.en.stop_words import STOP_WORDS
stopwords.append(STOP_WORDS)

def remove_stopwords(tokens):
    return [t for t in tokens if t not in stopwords]

def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    non_text = '[^\w]'
    num_pattern = '\d'
    text_string = " ".join(remove_stopwords(text_string.split()))
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(num_pattern, '', parsed_text)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    parsed_text = re.sub(non_text,' ',parsed_text)
    parsed_text = parsed_text.lower().strip()
    return parsed_text

In [5]:
df_bal['tweet_clean'] = ''
for i, row in df_bal.iterrows():
    df_bal.at[i, 'tweet_clean'] = preprocess(row.tweet)

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [8]:
for i, row in df_bal.iterrows():
    doc = nlp(str(row['tweet_clean']))
    sym = []
    nouns = []
    verbs = []
    lemmas = []
        
    for token in doc:
        lemmas.append(token.lemma_)
        if token.pos == 'SYM':
            sym.append(token.lemma_)
        if token.pos_ == 'NOUN' or token.pos_ == 'PROPN':
            nouns.append(token.lemma_)
        if token.pos_ == 'VERB':
            verbs.append(token.lemma_)
        
    df_bal.at[i, 'tweet_lemma'] = ' '.join(lemmas)
    df_bal.at[i, 'tweet_nouns'] = ' '.join(nouns)
    df_bal.at[i, 'tweet_sym'] = ' '.join(sym)
    df_bal.at[i, 'tweet_verbs'] = ' '.join(verbs)
    df_bal.at[i, 'tweet_nv'] = ' '.join(nouns + verbs)
    df_bal.at[i, 'num_tokens'] = len(lemmas)

In [9]:
#remove mentions, urls, hashtags, ;&, and 'rt' and other punctuation. keep a count of mentions, urls, hashtags
tweets = df_bal['tweet_lemma']

mentions = []
urls = []
hashtags = []
i = 0
for tweet in tweets:
    tweet = tweet.split()
    mentions.append(tweet.count('mentionhere')+tweet.count('mentionhere:')+tweet.count('"mentionhere:')+tweet.count('&#;mentionhere:'))
    urls.append(tweet.count('urlhere'))
    hashtags.append(tweet.count('hashtaghere'))
    tweet = [token for token in tweet if token not in [';&','']]
    tweet = [token for token in tweet if token not in ['&#;mentionhere:','mentionhere:','"mentionhere:','mentionhere', 'urlhere', 'hashtaghere', 'rt', 'amp']]
    tweet = " ".join(tweet)
    tweets[i] = tweet
    i += 1
    
df_bal['tweet_no_others'] = tweets

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [10]:
corpus = df_bal['tweet_no_others']

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
# get all unique words in the corpus
vocab = cv.get_feature_names()
# show document feature vectors
df_BOW = pd.DataFrame(cv_matrix, columns=vocab)
df_BOW['class'] = df_bal['class']
df_BOW

Unnamed: 0,aa,aaaaaaaaand,aaahhhhh,aahahah,aaliyah,aap,aaron,aaronmacgruder,aaryn,ab,...,zone,zoning,zoo,zoom,zoote,zrgrizz,zuko,zulema,zulu,zzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20615,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20616,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20617,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20618,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
