# Predict Tweet Toxicity

In [3]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

In [4]:
filename = 'scrape_2a_0117_0318'
input_filename = "../twitter_data/ian_scraped_tweets/"+filename

In [5]:
df = pd.read_csv(input_filename, encoding = "ISO-8859-1",
                 usecols = ['date', 'fullname', 'id', 'likes', 'replies', 
                            'retweets', 'text', 'url', 'user', 'month'])
df.head(3)

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month
0,2018-03-05 23:59:55,NonPCconservative,970811174634663937,0,0,0,The type of people who complain about our 2nd ...,/Kill_PC/status/970811174634663937,Kill_PC,2018-03-01
1,2018-03-05 23:59:32,Â ðºð¸Â ðºð¸Michael OdincovÂ ðºð¸Â...,970811079738740737,0,0,0,Retweeted Erin'sTrumpÂ® (@EjHirschberger):\n\n...,/Franktorbino/status/970811079738740737,Franktorbino,2018-03-01
2,2018-03-05 23:59:30,ð£ðªðªð¯ðððð,970811068942499840,0,0,1,Gateway Pundit News \n Violent Leftists Storm...,/nuuzfeed/status/970811068942499840,nuuzfeed,2018-03-01


### Vectorize tweets and predict toxicity with model


In [6]:
# Load vectorizers
word_vectorizer = pickle.load(open("../models/word_vectorizer.pickle", "rb" ))
char_vectorizer = pickle.load(open("../models/char_vectorizer.pickle", "rb" ))

In [7]:
# Vectorize tweets
tweet_word_features = word_vectorizer.transform(df['text'])
tweet_char_features = char_vectorizer.transform(df['text'])
tweet_features = hstack([tweet_word_features, tweet_char_features])

In [8]:
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [9]:
# Load models
loaded_models = {}
for col in target_columns:
    loaded_models[col] = pickle.load(open('../models/model_{}.sav'.format(col), "rb" ))

In [10]:
# Run models
for col in target_columns:
    df[col] = loaded_models[col].predict_proba(tweet_features)[:, 1]

In [11]:
df['classification'] = df[target_columns].idxmax(axis=1)
df.head(3)

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month,toxic,severe_toxic,obscene,threat,insult,identity_hate,classification
0,2018-03-05 23:59:55,NonPCconservative,970811174634663937,0,0,0,The type of people who complain about our 2nd ...,/Kill_PC/status/970811174634663937,Kill_PC,2018-03-01,0.001541,0.001014,0.00024,8.5e-05,0.000501,4.6e-05,toxic
1,2018-03-05 23:59:32,Â ðºð¸Â ðºð¸Michael OdincovÂ ðºð¸Â...,970811079738740737,0,0,0,Retweeted Erin'sTrumpÂ® (@EjHirschberger):\n\n...,/Franktorbino/status/970811079738740737,Franktorbino,2018-03-01,0.012964,0.002905,0.002182,0.000103,0.001523,0.00017,toxic
2,2018-03-05 23:59:30,ð£ðªðªð¯ðððð,970811068942499840,0,0,1,Gateway Pundit News \n Violent Leftists Storm...,/nuuzfeed/status/970811068942499840,nuuzfeed,2018-03-01,0.00245,0.005353,0.000565,5.4e-05,0.000149,2.3e-05,severe_toxic


In [12]:
# export df
filename = "classified"+filename[6:]
export_filename = "../classified/ian/"+filename
df.to_csv(export_filename)

### Visualizations

In [13]:
# visualize obscenity
df.groupby('date')['toxic'].mean().plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>

In [14]:
df['likes'] = df['likes'].astype(float)

# normalizing identity hate by tweet popularity
df['popular_obscene'] = df['obscene']*df['likes']
df.groupby('date')['popular_obscene'].mean().plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>

In [15]:
# normalizing identity hate by number of popular tweets
(df.groupby('date')['popular_obscene'].sum() / df.groupby('date')['likes'].sum()).plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>

In [40]:
# normalizing popular tweets by number of popular tweets
(df.groupby('date')['popular_obscene'].sum() / df.groupby('date')['likes'].count()).plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a2392f518>