# Predict Tweet Toxicity

In [18]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
%matplotlib

Using matplotlib backend: MacOSX


In [2]:
# Path of the scraped tweets file; it is read as CSV by pd.read_csv in the loading cell.
filename = "twitter_data/andrea_scraped/Hillary2016"

In [3]:
# Column labels for the scraped file, handed to read_csv through `names=`.
tweet_columns = ['date', 'fullname', 'id', 'likes', 'replies',
                 'retweets', 'text', 'url', 'user', 'month']

# Read the tweets, then discard duplicate rows and any row containing a missing value.
df = (
    pd.read_csv(filename, encoding="ISO-8859-1", names=tweet_columns)
    .drop_duplicates()
    .dropna()
)

# Toxicity labels: one saved model is loaded per label further down, and each
# label also becomes a score column in `df`.
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


### Vectorize tweets and predict toxicity with model

In [4]:
# Load the two pickled vectorizers (word and char).
# Each file is opened in a `with` block so the handle is closed as soon as the
# object has been read, instead of being left open for the life of the kernel.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("models/word_vectorizer.pickle", "rb") as vectorizer_file:
    word_vectorizer = pickle.load(vectorizer_file)
with open("models/char_vectorizer.pickle", "rb") as vectorizer_file:
    char_vectorizer = pickle.load(vectorizer_file)

In [5]:
# Transform the tweet text with both vectorizers and join the results side by side
# (word features first, char features second) into one sparse feature matrix.
tweet_text = df['text']
tweet_word_features, tweet_char_features = (
    vectorizer.transform(tweet_text)
    for vectorizer in (word_vectorizer, char_vectorizer)
)
# NOTE(review): presumably this column order matches the one used when the
# models were trained - confirm against the training notebook.
tweet_features = hstack([tweet_word_features, tweet_char_features])

In [6]:
# Load one pickled model per toxicity label, keyed by label name.
# The `with` block closes each model file right after it is read; the previous
# pickle.load(open(...)) form left every handle open.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
loaded_models = {}
for col in target_columns:
    with open('models/model_{}.sav'.format(col), "rb") as model_file:
        loaded_models[col] = pickle.load(model_file)

In [7]:
# Score every tweet with each per-label model and store the result as a column
# named after the label.
for col in target_columns:
    class_probabilities = loaded_models[col].predict_proba(tweet_features)
    # Keep column index 1 of predict_proba - presumably the positive class;
    # confirm against the model's `classes_` ordering.
    df[col] = class_probabilities[:, 1]

In [8]:
# Preview the first rows, which now include one score column per entry in target_columns.
df.head()

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,2016-01-11 21:14:18,Hillary 4 President,686657407896961025,1.0,0.0,4.0,So true\n\n--\nDonate $5 to the Hillary campai...,/HillaryNews/status/686657407896961025,HillaryNews,2016-01-01,0.000988,0.00247,0.001376,4.3e-05,0.000911,2e-05
1,2016-01-11 23:05:38,sarah hayse,686685426845925376,5.0,1.0,3.0,If you vote for Hillary Clinton just because s...,/sarchx23/status/686685426845925376,sarchx23,2016-01-01,0.008489,0.00068,0.001027,2.2e-05,8.7e-05,0.000188
2,2016-01-11 22:14:15,Sam,686672494590169088,147.0,8.0,161.0,#Veterans Band Together & Make Massive Move to...,/ArizonaSheepdog/status/686672494590169088,ArizonaSheepdog,2016-01-01,0.000206,0.001398,0.000139,3.2e-05,3.4e-05,5.6e-05
3,2016-01-07 23:30:54,Hillary for Iowa,685242233361059841,2269.0,402.0,1880.0,Retweet if you agree with @billclinton: Hillar...,/HillaryforIA/status/685242233361059841,HillaryforIA,2016-01-01,0.000272,0.0028,0.000413,4.3e-05,7.6e-05,5.2e-05
4,2016-01-10 22:37:18,Robyn,686315907891605504,1506.0,5.0,952.0,"""Hillary Clinton & Donald Trump drop out of pr...",/lilmsragerr/status/686315907891605504,lilmsragerr,2016-01-01,0.000424,0.001486,0.00031,1.5e-05,0.000271,7.2e-05


In [9]:
# Label each tweet with the name of its highest-scoring toxicity column.
# NOTE(review): idxmax always returns a label, even when every score is tiny
# (a row whose top score is 0.00247 is labelled 'severe_toxic' in the preview
# below) - consider a minimum-score threshold if a "not toxic" class is wanted.
toxicity_scores = df[target_columns]
df['classification'] = toxicity_scores.idxmax(axis='columns')

In [10]:
# Preview again to check the newly added 'classification' column.
df.head()

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month,toxic,severe_toxic,obscene,threat,insult,identity_hate,classification
0,2016-01-11 21:14:18,Hillary 4 President,686657407896961025,1.0,0.0,4.0,So true\n\n--\nDonate $5 to the Hillary campai...,/HillaryNews/status/686657407896961025,HillaryNews,2016-01-01,0.000988,0.00247,0.001376,4.3e-05,0.000911,2e-05,severe_toxic
1,2016-01-11 23:05:38,sarah hayse,686685426845925376,5.0,1.0,3.0,If you vote for Hillary Clinton just because s...,/sarchx23/status/686685426845925376,sarchx23,2016-01-01,0.008489,0.00068,0.001027,2.2e-05,8.7e-05,0.000188,toxic
2,2016-01-11 22:14:15,Sam,686672494590169088,147.0,8.0,161.0,#Veterans Band Together & Make Massive Move to...,/ArizonaSheepdog/status/686672494590169088,ArizonaSheepdog,2016-01-01,0.000206,0.001398,0.000139,3.2e-05,3.4e-05,5.6e-05,severe_toxic
3,2016-01-07 23:30:54,Hillary for Iowa,685242233361059841,2269.0,402.0,1880.0,Retweet if you agree with @billclinton: Hillar...,/HillaryforIA/status/685242233361059841,HillaryforIA,2016-01-01,0.000272,0.0028,0.000413,4.3e-05,7.6e-05,5.2e-05,severe_toxic
4,2016-01-10 22:37:18,Robyn,686315907891605504,1506.0,5.0,952.0,"""Hillary Clinton & Donald Trump drop out of pr...",/lilmsragerr/status/686315907891605504,lilmsragerr,2016-01-01,0.000424,0.001486,0.00031,1.5e-05,0.000271,7.2e-05,severe_toxic


In [11]:
# Export the scored and classified tweets as CSV (the DataFrame index is written too).
export_filename = "classified/Andrea/Hillary2016"

# Create the target directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
export_dir = os.path.dirname(export_filename)
if export_dir:
    os.makedirs(export_dir, exist_ok=True)

df.to_csv(export_filename)

### Visualizations

In [20]:
# Mean predicted 'obscene' score per month.
# Draw on a dedicated figure: a bare Series.plot() reuses the current axes, so
# re-running this cell after other plots would draw on top of them.
fig, ax = plt.subplots()
df.groupby('month')['obscene'].mean().plot(ax=ax)
ax.set(title="Mean 'obscene' score by month",
       xlabel='month',
       ylabel="mean 'obscene' score");

<matplotlib.axes._subplots.AxesSubplot at 0x1a14d50470>

In [13]:
# Make sure like counts are numeric before multiplying.
df['likes'] = df['likes'].astype(float)

# Weight each tweet's 'obscene' score by its number of likes.
# (This uses the 'obscene' column, not 'identity_hate'.)
df['popular_obscene'] = df['obscene'] * df['likes']

# Mean like-weighted obscenity per 'date' value, on its own figure so it is not
# drawn on top of another cell's plot.
# NOTE(review): the previewed 'date' values are full timestamps, so this groups
# per timestamp rather than per day - confirm the intended granularity.
fig, ax = plt.subplots()
df.groupby('date')['popular_obscene'].mean().plot(ax=ax)
ax.set(title="Mean like-weighted 'obscene' score by date",
       xlabel='date',
       ylabel="mean of obscene * likes");

<matplotlib.axes._subplots.AxesSubplot at 0x1a13e0bd30>

In [14]:
# Likes-weighted average 'obscene' score per date:
#     sum(obscene * likes) / sum(likes)
# (This uses the 'obscene' column, not 'identity_hate'.)
# Dates whose tweets have zero total likes give 0/0 = NaN and show as gaps.
tweets_by_date = df.groupby('date')
likes_weighted_obscene = (
    tweets_by_date['popular_obscene'].sum() / tweets_by_date['likes'].sum()
)

# Separate figure so this curve is not overlaid on another cell's plot.
fig, ax = plt.subplots()
likes_weighted_obscene.plot(ax=ax)
ax.set(title="Likes-weighted average 'obscene' score by date",
       xlabel='date',
       ylabel="sum(obscene * likes) / sum(likes)");

<matplotlib.axes._subplots.AxesSubplot at 0x1a13e0bd30>

In [15]:
# Like-weighted 'obscene' score summed per date, divided by the number of
# non-null 'likes' entries (i.e. tweets) on that date.
# NOTE(review): with no missing values this looks identical to the per-date
# mean of 'popular_obscene' plotted earlier - confirm whether both are needed.
tweets_by_date = df.groupby('date')
popular_obscene_per_tweet = (
    tweets_by_date['popular_obscene'].sum() / tweets_by_date['likes'].count()
)

# Separate figure so this curve is not overlaid on another cell's plot.
fig, ax = plt.subplots()
popular_obscene_per_tweet.plot(ax=ax)
ax.set(title="Like-weighted 'obscene' score per tweet by date",
       xlabel='date',
       ylabel="sum(obscene * likes) / tweet count");

<matplotlib.axes._subplots.AxesSubplot at 0x1a13e0bd30>