In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from IPython.display import clear_output

# Read Data and Clean

In [3]:
# Load the scraped Discord messages, using the CSV's 'Unnamed: 0' column as the row index
df = pd.read_csv(
    '../data/scraped data/league_discord_2.csv',
    index_col='Unnamed: 0',
)

In [4]:
# Inspect the loaded frame (the rendered output shows only the first and last rows)
df

Unnamed: 0,name,content,timestamp
0,QuantumEnno#5661,why tf swain looks like the guy from call the ...,2020-11-09T03:23:48.052000+00:00
1,3R1CF4N#5701,<:WEIRD:665202887009501192>,2020-11-09T03:25:44.219000+00:00
2,Jay123lol#1296,https://cdn.discordapp.com/attachments/7236198...,2020-11-09T03:27:21.705000+00:00
3,fears#3642,anyone down for some ranked? gold,2020-11-09T03:29:10.062000+00:00
4,fears#3642,or norms,2020-11-09T03:29:30.557000+00:00
...,...,...,...
17050,lauv#0444,thats what i call a clown fiesta,2020-06-25T02:37:50.161000+00:00
17051,skux#6149,you still playing morg jg arazorias?,2020-06-25T02:38:10.499000+00:00
17052,Arazorias#8819,https://cdn.discordapp.com/attachments/5318526...,2020-06-25T02:38:21.398000+00:00
17053,Arazorias#8819,nah morg jungle doesnt honestly feel too good ...,2020-06-25T02:38:40.738000+00:00


In [5]:
# Only the message text is needed from here on; keep it as a one-column DataFrame
df = df[['content']]

In [6]:
# Keep the first occurrence of every distinct row and discard later repeats
df = df[~df.duplicated()]

In [6]:
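# Disabled: the label columns do not need to be pre-allocated here; get_predictions adds one column per class.
# df = df.reindex(columns=df.columns.to_list() + ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])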

In [7]:
# NOTE: regex=True is passed explicitly on every call below. Since pandas 2.0,
# Series.str.replace treats the pattern as a literal string by default, which
# would silently leave every emote, URL and newline in place.

# Get rid of Emotes for Discord Messages.
# The non-greedy `.*?` stops at the first '>', so ordinary text sitting between
# two emotes on the same line is kept (a greedy `.*` would delete it as well).
df.loc[:, 'content'] = df['content'].str.replace(r'<.*?>', '', regex=True)

# Get rid of urls
df.loc[:, 'content'] = df['content'].str.replace(r'http\S+', '', regex=True)

# Get Rid of New Line and Strip
df.loc[:, 'content'] = df['content'].str.replace(r'\n', '', regex=True).str.strip()

In [8]:
# Get rid of empty Strings.
# Missing values are dropped as well: NaN != "" evaluates to True, so a missing
# message would otherwise survive this filter and be turned into the literal
# text 'nan' by the later .astype('U') conversion before vectorizing.
df = df[df['content'].notna() & (df['content'] != "")]

In [9]:
# Renumber the rows 0..n-1; drop=True discards the old index labels instead of keeping them as a column
df = df.reset_index(drop=True)

# Make Predictions Using Logistic Regression

## Setup

In [10]:
# Target labels: each one has its own pickled model file and becomes one output column
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

## Vectorizer

In [11]:
# Read Vectorizer
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load artifacts that were produced by this project.
filename = 'log_reg_vectorizer.sav'
# Use a context manager so the file handle is closed as soon as loading finishes
with open('../output/' + filename, 'rb') as vectorizer_file:
    word_vectorizer = pickle.load(vectorizer_file)

## Make Predictions

In [14]:
def get_predictions(model, threshold, x_test, class_name, frame=None):
    """Score ``x_test`` with ``model`` and store thresholded 0/1 labels as a column.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output is
        taken as the positive-class probability.
    threshold : float
        A row is labelled 1 only when its probability is strictly greater than
        this value, otherwise 0.
    x_test : array-like or sparse matrix
        Features passed to ``model.predict_proba``; must contain one row per row
        of the target frame.
    class_name : str
        Name of the column that receives the labels.
    frame : pandas.DataFrame, optional
        Frame to write the column into. Defaults to the notebook-level ``df``,
        which keeps existing four-argument calls working unchanged.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    # Fall back to the notebook-global frame only when no explicit target is given
    target = df if frame is None else frame

    # Make Predictions
    predictions = model.predict_proba(x_test)[:, 1]

    # Add Column with Predictions
    target[class_name] = np.where(predictions > threshold, 1, 0)

In [18]:
# Vectorize every cleaned message once; astype('U') coerces each value to a unicode string
x_test = word_vectorizer.transform(df['content'].values.astype('U'))
for class_name in classes:
    filename = 'log_reg_' + class_name + '.sav'
    # Each artifact stores a (model, threshold) pair. Open it with a context manager so the
    # file handle is closed on every iteration instead of being leaked.
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted artifacts.
    with open('../output/' + filename, 'rb') as model_file:
        model, threshold = pickle.load(model_file)
    threshold = 0.8 #override optimal threshold as we care more about false positives

    get_predictions(model, threshold, x_test, class_name)
    

In [19]:
# Peek at the first rows, now including one 0/1 label column per class
df.head()

Unnamed: 0,content,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,why tf swain looks like the guy from call the ...,0,0,0,0,0,0
1,anyone down for some ranked? gold,0,0,0,0,0,0
2,or norms,0,0,0,0,0,0
3,y does almost everyone go inspiration in pro l...,0,0,0,0,0,0
4,what,0,0,0,0,0,0


In [21]:
# Count the messages flagged for at least one class: any(axis=1) works row-wise
# over the label columns from 'toxic' through 'identity_hate' (inclusive slice)
df.loc[:, 'toxic':'identity_hate'].any(axis=1).sum()

277

In [22]:
# Row-wise flag: True when at least one label column from 'toxic' to 'identity_hate' is non-zero
flagged = df.loc[:, 'toxic':'identity_hate'].any(axis=1)
# Keep only the flagged messages
positive = df[flagged]

# Convert to CSV

In [23]:
# Write the flagged messages (index included) to the predicted-data folder
positive.to_csv(
    path_or_buf='../data/predicted data/predicted_league_discord_2.csv',
)