# Predict Tweet Toxicity

In [13]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

In [2]:
def pre_process(df):
    """Clean a tweet dataframe before it is vectorized.

    Drops every row that contains a missing value, drops exact duplicate
    rows, and converts each value of the ``text`` column with ``str``.
    The row count before and after cleaning and the number of distinct
    ``id`` values are printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``text`` and ``id`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The frame passed in is not modified.
    """
    print("Initial dataframe length: ", len(df))

    # Build the result in one chain. `.assign` returns a fresh frame, so the
    # text conversion never writes into a frame derived by filtering (the
    # pattern that raises SettingWithCopyWarning).
    cleaned = (
        df.dropna()
        .drop_duplicates()
        .assign(text=lambda frame: frame['text'].apply(str))
    )

    # Missing ids were removed by dropna() above, so nunique() counts the
    # same values the unique() array would hold.
    print("# of Unique ID's: ", cleaned['id'].nunique())
    print("Final dataframe length: ", len(cleaned))
    return cleaned

In [4]:
# Name of the scraped-tweet file to classify, and the relative path it is read from.
filename = 'test2'
input_filename = "../twitter_data/ian_scraped_tweets/{}".format(filename)

In [5]:
# Column names are supplied explicitly via `names=` rather than taken from the file.
colnames = [
    'index', 'date', 'fullname', 'id', 'likes', 'replies',
    'retweets', 'text', 'url', 'user', 'month',
]
df = pd.read_csv(input_filename, names=colnames, encoding="ISO-8859-1")
df.columns

Index(['index', 'date', 'fullname', 'id', 'likes', 'replies', 'retweets',
       'text', 'url', 'user', 'month'],
      dtype='object')

In [6]:
# Disabled alternative loader that selects columns with `usecols` instead of `names`:
#df = pd.read_csv(input_filename, encoding = "ISO-8859-1",
#                 usecols = ['date', 'fullname', 'id', 'likes', 'replies', 
#                            'retweets', 'text', 'url', 'user', 'month'])

# Clean the loaded frame (drop NAs and duplicates, stringify text) and preview it.
df = df.pipe(pre_process)
df.head(3)

Initial dataframe length:  572
# of Unique ID's:  572
Final dataframe length:  572


Unnamed: 0,index,date,fullname,id,likes,replies,retweets,text,url,user,month
0,0,2018-01-01 23:59:54,New Republitarian,947980735717265409,2,0,3,"If ""Dreamers"" were known to lean 80+% Republic...",/NuRepublitarian/status/947980735717265409,NuRepublitarian,2018-01-01
1,1,2018-01-01 23:59:51,Fuck DACA,947980722517757952,0,1,0,Maybe they are in fear of deportation because ...,/daca_fuck/status/947980722517757952,daca_fuck,2018-01-01
2,2,2018-01-01 23:59:43,Politicallady,947980686786473984,0,0,0,But what about the DACA children? #DACA,/Politicallady9/status/947980686786473984,Politicallady9,2018-01-01


### Vectorize tweets and predict toxicity with model


In [7]:
# Load vectorizers
# NOTE: unpickling looks up the classes stored in the pickle by name. The
# recorded run of this cell failed with "Can't get attribute 'LemmaTokenizer'
# on <module '__main__'>", so `LemmaTokenizer` must be defined or imported in
# this notebook before the cell is executed.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# from a trusted source.
# `with` closes each file handle as soon as its vectorizer has been loaded.
with open("../toxicity_models/word_vectorizer.pickle", "rb") as word_vectorizer_file:
    word_vectorizer = pickle.load(word_vectorizer_file)
with open("../toxicity_models/char_vectorizer.pickle", "rb") as char_vectorizer_file:
    char_vectorizer = pickle.load(char_vectorizer_file)

AttributeError: Can't get attribute 'LemmaTokenizer' on <module '__main__'>

In [19]:
# Vectorize tweets
# Transform the tweet text with both fitted vectorizers, then place the two
# feature matrices side by side in a single sparse matrix.
tweet_texts = df['text']
tweet_word_features = word_vectorizer.transform(tweet_texts)
tweet_char_features = char_vectorizer.transform(tweet_texts)
tweet_features = hstack((tweet_word_features, tweet_char_features))

In [20]:
# One label per saved model; each becomes a score column on the dataframe.
target_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [21]:
# Load models
# Load one pickled model per target column, keyed by the column name.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# from a trusted source.
loaded_models = {}
for col in target_columns:
    # `with` closes each model file right after it is read instead of
    # leaving the handle open.
    with open('../toxicity_models/model_{}.sav'.format(col), "rb") as model_file:
        loaded_models[col] = pickle.load(model_file)

In [22]:
# Run models
# Score every tweet with each model and store the result under the label name.
for col in target_columns:
    class_probabilities = loaded_models[col].predict_proba(tweet_features)
    # Keep the second probability column — presumably the positive class; TODO confirm.
    df[col] = class_probabilities[:, 1]

In [23]:
# Label each tweet with the target column that holds its highest score.
toxicity_scores = df[target_columns]
df['classification'] = toxicity_scores.idxmax(axis='columns')
df.head(3)

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month,toxic,severe_toxic,obscene,threat,insult,identity_hate,classification
0,2017-10-13 23:59:48,The Rose Bushes,918989678929436673,0.0,0.0,0.0,Undocumented Teen Held In Texas Is At The Hear...,/TheRoseBushes/status/918989678929436673,TheRoseBushes,2017-10-01,0.025666,0.001729,0.001311,4.7e-05,0.000224,8.9e-05,toxic
1,2017-10-13 23:59:17,Mr. Matthew,918989549640052736,2.0,1.0,1.0,https://www.youtube.com/watch?v=IPl5bCcwMMo&t=...,/MattSmith1776/status/918989549640052736,MattSmith1776,2017-10-01,0.008181,0.000374,0.001491,2.5e-05,0.000939,0.000108,toxic
2,2017-10-13 23:59:15,Everything,918989540240576514,0.0,0.0,0.0,New story on NPR: Undocumented Teen Held In Te...,/iTweet_News/status/918989540240576514,iTweet_News,2017-10-01,0.059671,0.001845,0.002058,0.000102,0.000445,0.000101,toxic


In [24]:
# export df
# Build the output name in its own variable instead of overwriting
# `filename`, so re-running this cell always produces the same path.
# `filename[6:]` drops the first six characters of the input name —
# presumably a fixed-length prefix on the scraped files; TODO confirm. For
# names of six characters or fewer the suffix is empty and the output is
# just "classified".
export_name = "classified" + filename[6:]
export_filename = "../classified/ian/" + export_name
df.to_csv(export_filename)

### Visualizations

In [12]:
# visualize obscenity
# Plot the mean `toxic` score for each distinct `date` value.
toxic_by_date = df.groupby('date')['toxic'].mean()
toxic_by_date.plot()

KeyError: 'Column not found: toxic'

In [48]:
# Cast likes to float so they can be multiplied with the scores.
likes_as_float = df['likes'].astype(float)
df['likes'] = likes_as_float

# Weight each tweet's `obscene` score by its like count, then plot the
# per-date mean of that weighted score.
df['popular_obscene'] = df['obscene'] * likes_as_float
popular_obscene_by_date = df.groupby('date')['popular_obscene'].mean()
popular_obscene_by_date.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>

In [27]:
# normalizing identity hate by number of popular tweets
# Per date: summed like-weighted obscene score divided by the summed likes.
weighted_obscene_total = df.groupby('date')['popular_obscene'].sum()
likes_total = df.groupby('date')['likes'].sum()
(weighted_obscene_total / likes_total).plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>

In [28]:
# normalizing popular tweets by number of popular tweets
# Per date: summed like-weighted obscene score divided by the number of
# non-null `likes` entries.
weighted_obscene_sum = df.groupby('date')['popular_obscene'].sum()
likes_entry_count = df.groupby('date')['likes'].count()
(weighted_obscene_sum / likes_entry_count).plot()

<matplotlib.axes._subplots.AxesSubplot at 0x1a289e8518>