# Predict Tweet Toxicity

In [3]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

In [38]:
# Base name of the scraped-tweet file, and the relative path it is read from.
filename = 'scrape_feb_0117_0318'
input_filename = "../twitter_data/ian_scraped_tweets/{}".format(filename)

In [39]:
# Read the scraped tweets, keeping only the listed columns and decoding the
# file as ISO-8859-1, then preview the first three rows.
tweet_columns = [
    'date', 'fullname', 'id', 'likes', 'replies',
    'retweets', 'text', 'url', 'user', 'month',
]
df = pd.read_csv(input_filename, usecols=tweet_columns, encoding="ISO-8859-1")
df.head(3)

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month
0,2018-02-17 23:59:51,Defend The Switches,965012952406409216,0,0,0,Police shut down 2 unlicensed Big Top Firework...,/DefendtheSwitch/status/965012952406409216,DefendtheSwitch,2018-02-01
1,2018-02-17 23:59:50,Alex Psilakis,965012948208046080,0,0,0,That of course changes by the end of the episo...,/APsilakis/status/965012948208046080,APsilakis,2018-02-01
2,2018-02-17 23:59:49,Cullman Traffic,965012943602704385,1,0,0,CULLMAN CO SHERIFF: HWY 69 N IN FRONT OF BAILE...,/cullmantraffic/status/965012943602704385,cullmantraffic,2018-02-01


### Vectorize tweets and predict toxicity with model


In [40]:
# Load the two pickled vectorizers (word and char, per their file names).
# They are used with .transform() only, so they must already be fitted.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been read, rather than being left open for the kernel's lifetime.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../models/word_vectorizer.pickle", "rb") as word_file:
    word_vectorizer = pickle.load(word_file)
with open("../models/char_vectorizer.pickle", "rb") as char_file:
    char_vectorizer = pickle.load(char_file)

In [41]:
# Turn the tweet text into two sparse feature blocks (word first, then char)
# and join them column-wise into a single feature matrix.
tweet_word_features, tweet_char_features = (
    vectorizer.transform(df['text'])
    for vectorizer in (word_vectorizer, char_vectorizer)
)
tweet_features = hstack([tweet_word_features, tweet_char_features])

In [42]:
# One label per model; these names are also used as the prediction column names.
target_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [43]:
# Load one pickled model per target column, keyed by the column name.
# The `with` block closes each model file right after it is read, so the loop
# does not leave one open handle per model behind.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
loaded_models = {}
for col in target_columns:
    with open('../models/model_{}.sav'.format(col), "rb") as model_file:
        loaded_models[col] = pickle.load(model_file)

In [44]:
# Score every tweet with each model and store the result in a column named
# after the label. Column 1 of predict_proba is kept -- presumably the
# probability of the positive class; confirm against each model's classes_.
for col in target_columns:
    class_probabilities = loaded_models[col].predict_proba(tweet_features)
    df[col] = class_probabilities[:, 1]

In [45]:
# Label each tweet with the target column holding its highest score.
# Note: this always picks a label, even when every score is close to zero.
df['classification'] = df.loc[:, target_columns].idxmax(axis='columns')
df.head(3)

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month,toxic,severe_toxic,obscene,threat,insult,identity_hate,classification
0,2018-02-17 23:59:51,Defend The Switches,965012952406409216,0,0,0,Police shut down 2 unlicensed Big Top Firework...,/DefendtheSwitch/status/965012952406409216,DefendtheSwitch,2018-02-01,0.234758,0.001351,5e-05,2.6e-05,0.000549,6.4e-05,toxic
1,2018-02-17 23:59:50,Alex Psilakis,965012948208046080,0,0,0,That of course changes by the end of the episo...,/APsilakis/status/965012948208046080,APsilakis,2018-02-01,0.000138,0.002855,0.000795,4.7e-05,7.9e-05,0.000134,severe_toxic
2,2018-02-17 23:59:49,Cullman Traffic,965012943602704385,1,0,0,CULLMAN CO SHERIFF: HWY 69 N IN FRONT OF BAILE...,/cullmantraffic/status/965012943602704385,cullmantraffic,2018-02-01,0.047736,0.000189,0.000388,9e-06,9e-05,2.5e-05,toxic


In [46]:
# Export the scored tweets to CSV.
# The output name swaps the leading "scrape" of the input name for
# "classified". It is built in its own variable rather than overwriting
# `filename`, so re-running this cell always writes to the same path instead
# of slicing an already-rewritten name.
export_basename = "classified" + filename[len("scrape"):]
export_filename = "../classified/ian/" + export_basename
df.to_csv(export_filename)

### Visualizations

In [50]:
# Plot the mean 'toxic' score for each distinct value of the 'date' column.
# NOTE(review): groups on the raw 'date' values as loaded -- confirm this is
# the intended granularity.
mean_toxic_by_date = df.groupby('date')['toxic'].mean()
mean_toxic_by_date.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>

In [48]:
# Weight each tweet's 'obscene' score by its like count, then plot the mean of
# that weighted score for each distinct 'date' value.
df['likes'] = df['likes'].astype(float)
df['popular_obscene'] = df['obscene'].mul(df['likes'])

mean_popular_obscene_by_date = df.groupby('date')['popular_obscene'].mean()
mean_popular_obscene_by_date.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>

In [27]:
# Per-date sum of 'popular_obscene' divided by the per-date sum of 'likes'.
# Dates whose likes sum to zero have no defined ratio.
tweets_by_date = df.groupby('date')
popular_obscene_total = tweets_by_date['popular_obscene'].sum()
likes_total = tweets_by_date['likes'].sum()
(popular_obscene_total / likes_total).plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>

In [28]:
# Per-date sum of 'popular_obscene' divided by the number of non-null 'likes'
# values on that date.
tweets_by_date = df.groupby('date')
popular_obscene_total = tweets_by_date['popular_obscene'].sum()
likes_count = tweets_by_date['likes'].count()
(popular_obscene_total / likes_count).plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>