Cleaning full dataset

### Data Cleaning

In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
pd.options.display.width = 500
import nltk
import string
import re
from nltk.stem.porter import *

In [4]:
df_bal = pd.read_csv('labeled_data.csv')
df_bal = df_bal[df_bal['class'] != 2].iloc[:,1:]

In [5]:
df_bal['class'] = df_bal['class'].astype('int')

### Data Exploration

In [6]:
#Data Preprocessing
import nltk
stopwords = nltk.corpus.stopwords.words('english')
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)
from spacy.lang.en.stop_words import STOP_WORDS
stopwords.append(STOP_WORDS)

def remove_stopwords(tokens):
    return [t for t in tokens if t not in stopwords]

def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    non_text = '[^\w]'
    num_pattern = '\d'
    text_string = " ".join(remove_stopwords(text_string.split()))
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(num_pattern, '', parsed_text)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    parsed_text = re.sub(non_text,' ',parsed_text)
    parsed_text = parsed_text.lower().strip()
    return parsed_text

In [7]:
df_bal['tweet_clean'] = ''
for i, row in df_bal.iterrows():
    df_bal.at[i, 'tweet_clean'] = preprocess(row.tweet)

In [8]:
df_bal['tweet_clean'].head()

1    rt mentionhere  boy dats cold   tyga dwn bad c...
2    rt mentionhere dawg     rt mentionhere  you ev...
3         rt mentionhere  mentionhere look like tranny
4    rt mentionhere  the shit hear might true might...
5    mentionhere  the shit blows me  claim faithful...
Name: tweet_clean, dtype: object

In [9]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [10]:
for i, row in df_bal.iterrows():
    doc = nlp(str(row['tweet_clean']))
    sym = []
    nouns = []
    verbs = []
    lemmas = []
        
    for token in doc:
        lemmas.append(token.lemma_)
        if token.pos == 'SYM':
            sym.append(token.lemma_)
        if token.pos_ == 'NOUN' or token.pos_ == 'PROPN':
            nouns.append(token.lemma_)
        if token.pos_ == 'VERB':
            verbs.append(token.lemma_)
        
    df_bal.at[i, 'tweet_lemma'] = ' '.join(lemmas)
    df_bal.at[i, 'tweet_nouns'] = ' '.join(nouns)
    df_bal.at[i, 'tweet_sym'] = ' '.join(sym)
    df_bal.at[i, 'tweet_verbs'] = ' '.join(verbs)
    df_bal.at[i, 'tweet_nv'] = ' '.join(nouns + verbs)
    df_bal.at[i, 'num_tokens'] = len(lemmas)

In [11]:
df_bal.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,tweet_clean,tweet_lemma,tweet_nouns,tweet_sym,tweet_verbs,tweet_nv,num_tokens
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt mentionhere boy dats cold tyga dwn bad c...,rt mentionhere boy dat cold tyga dwn bad ...,rt mentionhere boy dat tyga dwn cuffin dat hoe...,,,rt mentionhere boy dat tyga dwn cuffin dat hoe...,15.0
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt mentionhere dawg rt mentionhere you ev...,rt mentionhere dawg rt mentionhere -PRO...,rt mentionhere dawg rt mentionhere bitch start...,,fuck cry confuse,rt mentionhere dawg rt mentionhere bitch start...,17.0
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt mentionhere mentionhere look like tranny,rt mentionhere mentionhere look like tranny,rt mentionhere mentionhere tranny,,look,rt mentionhere mentionhere tranny look,7.0
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt mentionhere the shit hear might true might...,rt mentionhere the shit hear may true may fa...,rt mentionhere shit bitch,,hear may may faker tell,rt mentionhere shit bitch hear may may faker tell,13.0
5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just...",mentionhere the shit blows me claim faithful...,mentionhere the shit blow -PRON- claim fai...,mentionhere shit claim hoe,,blow fucking,mentionhere shit claim hoe blow fucking,13.0


In [12]:
#remove mentions, urls, hashtags, ;&, and 'rt' and other punctuation. keep a count of mentions, urls, hashtags
tweets = df_bal['tweet_lemma']

mentions = []
urls = []
hashtags = []
i = 0
for tweet in tweets:
    tweet = tweet.split()
    mentions.append(tweet.count('mentionhere')+tweet.count('mentionhere:')+tweet.count('"mentionhere:')+tweet.count('&#;mentionhere:'))
    urls.append(tweet.count('urlhere'))
    hashtags.append(tweet.count('hashtaghere'))
    tweet = [token for token in tweet if token not in [';&','']]
    tweet = [token for token in tweet if token not in ['&#;mentionhere:','mentionhere:','"mentionhere:','mentionhere', 'urlhere', 'hashtaghere', 'rt', 'amp']]
    tweet = " ".join(tweet)
    tweets[i] = tweet
    i += 1
    
df_bal['tweet_no_others'] = tweets
df_bal['mention_count'] = mentions
df_bal['url_count'] = urls
df_bal['hashtag_count'] = hashtags

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [14]:
corpus = df_bal['tweet_no_others']
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tfidfv_matrix = tfidfv.fit_transform(corpus)
tfidfv_matrix = tfidfv_matrix.toarray()

vocab = tfidfv.get_feature_names()
tfidf_df = pd.DataFrame(np.round(tfidfv_matrix, 2), columns=vocab)
tfidf_df

Unnamed: 0,aa,aaaaaaaaand,aaahhhhh,aahahah,aaliyah,aap,aaron,aaronmacgruder,aaryn,ab,...,zone,zoning,zoo,zoom,zoote,zrgrizz,zuko,zulema,zulu,zzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
tfidf_df['class'] = df_bal['class']

In [16]:
tfidf_df['class'].fillna(value=0, inplace=True)

In [17]:
tfidf_df['class'] = tfidf_df['class'].astype('int')

In [18]:
tfidf_df['class']

0        0
1        1
2        1
3        1
4        1
        ..
20615    0
20616    1
20617    0
20618    1
20619    1
Name: class, Length: 20620, dtype: int64

In [19]:
new_columns = ['num_tokens', 'mention_count', 'url_count', 'hashtag_count']

for col in new_columns:
    tfidf_df[col] = df_bal[col]

In [20]:
tfidf_df.to_csv('full_tfidf_df.csv')

In [66]:
import sqlite3

con = sqlite3.connect('twitter_hate.db')
tfidf_df.to_sql('full_tfidf_df', con, if_exists='replace', index=False)

OperationalError: too many columns on full_tfidf_df