# Index  
[Goal of the notebook](#Goal-of-the-notebook)  
[Data Visualization](#Data-Visulaization)  
[Prepare data for ML Classifier](#Prepare-data-for-ML-Classifier)  
[ML Pipeline](#ML-Pipeline)  
[Evaluation](#Evaluation)  
[Conclusion](#Conclusion)  

WARNING: due to the nature of the data, there will be some swear words in the Data Visualization section. If you do not wish to view such words, please skip that section.  

[GitHub](https://github.com/FancyWhale69/toxic_tweets_classifier)  
[DashBoard](https://toxic-tweets.herokuapp.com/)

# Goal of the notebook  

The goal of this notebook is to understand some characteristics of toxic tweets and to build an ML pipeline that classifies tweets as toxic or non-toxic.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import string
from nltk.stem import SnowballStemmer
import re
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
# Single English Snowball stemmer instance, reused by every text-cleaning cell.
stemmer = SnowballStemmer(language='english')

In [None]:
# Read the tweet dataset and discard the 'Unnamed: 0' column
# (presumably a saved index; it is not used anywhere in the notebook).
df = pd.read_csv('../input/toxic-tweets-dataset/FinalBalancedDataset.csv')
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# Load the custom English stop-word list (one entry per line). The context
# manager guarantees the file handle is closed even if reading raises.
with open('../input/stop-words/stop_words_english.txt', 'r', encoding='utf-8') as f:
    stop_wordsV2 = [line.replace('\n', '') for line in f]

# The cleaning cells strip punctuation before the stop-word check, so a word
# such as "don't" reaches that check as "dont". Add the apostrophe-free form of
# every stop word that contains one. The comprehension is fully evaluated
# before the list is extended, so only the original entries are scanned.
stop_wordsV2 += [w.replace("'", '') for w in stop_wordsV2 if "'" in w]

# Make sure the lower-cased pronoun "i" is always treated as a stop word.
stop_wordsV2.append('i')

# Data Visulaization

In [None]:
# Number of tweets per Toxicity class, shown as a labelled bar chart.
data = df.groupby('Toxicity')['tweet'].count()
fig = go.Figure(
    data=[
        go.Bar(
            x=data.index.get_level_values(0),
            y=data.values,
            text=data.values,
        )
    ]
)
# Print the (abbreviated) count above each bar.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    title='Dist. of data classes',
    xaxis_title='Classes (0=Non-Toxic, 1=Toxic)',
    yaxis_title='Count',
    uniformtext_minsize=8,
    uniformtext_mode='hide',
)
fig.show()

In [None]:

# Tokenise every toxic tweet (Toxicity == 1) for the word-frequency plot.
#
# Noise removed before tokenising: @mentions, the '#' sign, URLs, a mis-encoded
# ellipsis, the "amp" left over from the HTML entity "&amp;", and digit runs.
# The pattern is a raw string so '\.', '\-' and '\?' are not treated as
# (invalid) Python string escapes. "amp" is matched only as a whole token
# (\bamp\b); the previous "(amp)" also cut it out of ordinary words such as
# "camp" or "example".
noise_re = re.compile(r"@[0-9a-zA-Z]+|#|https?://[0-9a-zA-Z\./\-_\?]+|â¦|\bamp\b|[0-9]+")
# Built once instead of once per tweet.
punct_table = str.maketrans('', '', string.punctuation)
# Set membership is O(1); the stop-word list would be scanned for every token.
stop_lookup = set(stop_wordsV2)

# str.split() with no argument already collapses repeated/leading/trailing
# whitespace, so no separate space-normalising pass is needed.
words = df[df['Toxicity'] == 1]['tweet'].apply(
    lambda x: [
        w
        for w in noise_re.sub("", x).translate(punct_table).split()
        if w.lower() not in stop_lookup
    ]
)

# Flatten the per-tweet token lists into one list of stemmed tokens.
word = {'words': [stemmer.stem(d) for group in words for d in group]}

In [None]:
# Ten most frequent stemmed tokens among toxic tweets, as a labelled bar chart.
data = pd.DataFrame(word).value_counts().head(10)
fig = go.Figure(
    data=[
        go.Bar(
            x=data.index.get_level_values(0),
            y=data.values,
            text=data.values,
        )
    ]
)
# Print the (abbreviated) count above each bar.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    title='Top 10 words in toxic tweets',
    xaxis_title='Words',
    yaxis_title='Count',
    uniformtext_minsize=8,
    uniformtext_mode='hide',
)
fig.show()

In [None]:

# Tokenise every non-toxic tweet (Toxicity == 0) for the word-frequency plot.
#
# Noise removed before tokenising: @mentions, the '#' sign, URLs, a mis-encoded
# ellipsis, the "amp" left over from the HTML entity "&amp;", and digit runs.
# The pattern is a raw string so '\.', '\-' and '\?' are not treated as
# (invalid) Python string escapes. "amp" is matched only as a whole token
# (\bamp\b); the previous "(amp)" also cut it out of ordinary words such as
# "camp" or "example".
noise_re = re.compile(r"@[0-9a-zA-Z]+|#|https?://[0-9a-zA-Z\./\-_\?]+|â¦|\bamp\b|[0-9]+")
# Built once instead of once per tweet.
punct_table = str.maketrans('', '', string.punctuation)
# Set membership is O(1); the stop-word list would be scanned for every token.
stop_lookup = set(stop_wordsV2)

# str.split() with no argument already collapses repeated/leading/trailing
# whitespace, so no separate space-normalising pass is needed.
words = df[df['Toxicity'] == 0]['tweet'].apply(
    lambda x: [
        w
        for w in noise_re.sub("", x).translate(punct_table).split()
        if w.lower() not in stop_lookup
    ]
)

# Flatten the per-tweet token lists into one list of stemmed tokens.
word = {'words': [stemmer.stem(d) for group in words for d in group]}

In [None]:
# Ten most frequent stemmed tokens among non-toxic tweets, as a labelled bar chart.
data = pd.DataFrame(word).value_counts().head(10)
fig = go.Figure(
    data=[
        go.Bar(
            x=data.index.get_level_values(0),
            y=data.values,
            text=data.values,
        )
    ]
)
# Print the (abbreviated) count above each bar.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    title='Top 10 words in non-toxic tweets',
    xaxis_title='Words',
    yaxis_title='Count',
    uniformtext_minsize=8,
    uniformtext_mode='hide',
)
fig.show()

In [None]:
# Collect the hashtags used in toxic tweets (Toxicity == 1) and plot the ten
# most frequent ones.
hashtag_re = re.compile(r'#[a-zA-Z_0-9]+')
numeric_tag_re = re.compile(r'#[0-9]+')

words = df[df['Toxicity'] == 1]['tweet'].apply(lambda x: hashtag_re.findall(x))

# Drop hashtags that consist only of digits (e.g. #198473). fullmatch is
# required here: a plain search/findall also fires on tags that merely START
# with a digit (e.g. "#2ndamendment") and would wrongly discard them.
word = {
    'words': [
        tag
        for group in words
        for tag in group
        if numeric_tag_re.fullmatch(tag) is None
    ]
}

data = pd.DataFrame(word).value_counts()[:10]
fig = go.Figure([go.Bar(x=data.index.get_level_values(0), y=data.values, text=data.values)])
# Print the (abbreviated) count above each bar.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', title='Top 10 toxic hashtags',
                 xaxis_title='Hashtag', yaxis_title='Number of apperence')
fig.show()

In [None]:
# Collect the hashtags used in non-toxic tweets (Toxicity == 0) and plot the
# ten most frequent ones.
hashtag_re = re.compile(r'#[a-zA-Z_0-9]+')
numeric_tag_re = re.compile(r'#[0-9]+')

words = df[df['Toxicity'] == 0]['tweet'].apply(lambda x: hashtag_re.findall(x))

# Drop hashtags that consist only of digits (e.g. #198473). fullmatch is
# required here: a plain search/findall also fires on tags that merely START
# with a digit (e.g. "#2ndamendment") and would wrongly discard them.
word = {
    'words': [
        tag
        for group in words
        for tag in group
        if numeric_tag_re.fullmatch(tag) is None
    ]
}

data = pd.DataFrame(word).value_counts()[:10]
fig = go.Figure([go.Bar(x=data.index.get_level_values(0), y=data.values, text=data.values)])
# Print the (abbreviated) count above each bar.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', title='Top 10 non-toxic hashtags',
                 xaxis_title='Hashtag', yaxis_title='Number of apperence')
fig.show()

# Prepare data for ML Classifier

In [None]:
#Clean text of meaningless words
# Noise stripped from a tweet before tokenising, tried left to right:
#   #[0-9]+             purely numeric hashtags (removed whole)
#   @[0-9a-zA-Z]+       user mentions
#   #                   the hash sign of remaining hashtags (the tag text stays)
#   https?://...        URLs
#   \bamp\b             "amp" left over from the HTML entity "&amp;" -- matched
#                       as a whole token only, so words such as "camp" or
#                       "example" are no longer mangled
#   [^\x20-\x7e]        anything outside printable ASCII (emoji, mojibake, ...);
#                       this already covers every individual mis-encoded
#                       character that used to be listed separately
#   [0-9]+              digit runs
_NOISE_RE = re.compile(
    r"#[0-9]+|@[0-9a-zA-Z]+|#|https?://[0-9a-zA-Z\./\-_\?]+|\bamp\b|[^\x20-\x7e]|[0-9]+"
)
_WHITESPACE_RE = re.compile(r"\s+")
# Built once at import time instead of once per call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_words(x, stop_words=None, stem=None):
    '''
    Function to remove stop words, Hashtags, numbers, etc... and stem what is left

    Input- x: String (raw tweet text)
           stop_words: optional container of lower-case words to drop;
                       defaults to the notebook-level stop_wordsV2
                       (passing a set makes the lookup faster)
           stem: optional callable mapping a token to its stem;
                 defaults to the notebook-level stemmer.stem
    Output- String cleaned of meaningless words, tokens stemmed and joined by
            single spaces ("" when nothing is left)
    '''
    if stop_words is None:
        stop_words = stop_wordsV2
    if stem is None:
        stem = stemmer.stem

    # Turn newlines/tabs into plain spaces first. They are outside the
    # printable-ASCII range, so the noise pattern would otherwise delete them
    # and glue the neighbouring words together ("hello\nworld" -> "helloworld").
    text = _WHITESPACE_RE.sub(" ", x)
    text = _NOISE_RE.sub("", text).translate(_PUNCT_TABLE)

    # split() with no argument ignores leading/trailing/repeated spaces.
    kept = [w for w in text.split() if w.lower() not in stop_words]
    return " ".join(stem(w) for w in kept)
    
# Store the cleaned, stemmed version of every tweet in a new column.
df['Cleaned_tweets'] = df['tweet'].apply(clean_words)

In [None]:
#Split data
from sklearn.model_selection import train_test_split

# Features: cleaned tweet text. Target: the Toxicity label.
x, y = df['Cleaned_tweets'], df['Toxicity']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# ML Pipeline

In [None]:
# Text classification pipeline: raw token counts -> TF-IDF weighting ->
# linear support-vector classifier.
pipeline = Pipeline(
    steps=[
        ('count', CountVectorizer()),
        ('tf', TfidfTransformer()),
        ('clf', LinearSVC()),
    ]
)

In [None]:
# Fit on the training split produced by train_test_split above
# (x2_train / y2_train were never defined and raised NameError).
pipeline.fit(x_train, y_train)

# Evaluation

In [None]:
# Predict labels for the held-out test split (x2_test was never defined).
pred = pipeline.predict(x_test)

In [None]:
# Per-class precision / recall / F1 against the true test labels
# (y2_test was never defined).
print(classification_report(y_test, pred))

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels
# (y2_test was never defined).
print(confusion_matrix(y_test, pred))

# Conclusion

From the data visualization phase it can be observed that toxic tweets contain a lot of swear words and are usually found in political hashtags. On the other hand, non-toxic tweets contain positive words and are mostly found in positive hashtags.

The ML model achieved an accuracy of 94%, while the LSTM network I built achieved between 91% and 93% and, unlike the ML model, was slow to train.