### <font color = #FFE6E6> Import Libraries</font>


In [None]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import re
import nltk
import string
from nltk.text import Text

# Use matplotlib's bundled pastel seaborn style sheet for every figure below.
plt.style.use('seaborn-v0_8-pastel')
# Suppress all Python warnings for the rest of the session (keeps cell output
# short, but also hides deprecation notices from the libraries used here).
warnings.filterwarnings("ignore")

### <font color = #FFE6E6> Data Ingestion</font>


In [None]:
# Load only the two columns used by this notebook: the class label and the tweet text.
df = pd.read_csv("TwitterHateSpeech.csv", usecols=['label', 'tweet'])

# Summary of the loaded frame.
# NOTE: 'File_size' holds df.size, i.e. the number of cells (rows * columns),
# not the size of the CSV on disk.
data_stats = {
    'Observations': len(df),
    'Features': len(df.columns),
    'File_size': df.size,
    'Columns': df.columns,
    'Data_types': df.dtypes,
    'Null_vals': df.isna().sum(),
}

### <font color = #FFE6E6> Data Understanding</font>


### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; Predictor and Target Attributes</font>


In [None]:
# Select predictor and target by column name rather than by position.
# read_csv keeps the file's own column order no matter how `usecols` is
# ordered, so positional slicing silently depends on the CSV layout.
text = df[['tweet']]    # predictor: single-column DataFrame with the tweet text
label = df[['label']]   # target: single-column DataFrame with the class label

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; Pie Chart</font>

In [None]:
# Count the tweets in each class. The length of a filtered frame is its row
# count (shape[0] is rows, shape[1] is columns).
hateSpeech = len(df[df['label'] == 1])
normSpeech = len(df[df['label'] == 0])

typesSpeech = [hateSpeech, normSpeech]
labels = ['Hate Speech', 'Non-hate speech']

# Offset the non-hate slice slightly and annotate each slice with its percentage.
plt.pie(
    typesSpeech,
    labels=labels,
    explode=[0, .1],
    autopct='%1.1f%%',
    startangle=-50,
)
plt.show()

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; Word Clouds</font>

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; Hate Speech Word Cloud</font>

In [None]:
# Word cloud of every tweet labelled as hate speech (label == 1).
hate_speech = df[df['label'] == 1]

# Words left out of the cloud: the stop-word list shipped with wordcloud.
stopwords = set(STOPWORDS)

# Build the cloud's input text in one pass: each cell is coerced to str, split
# on whitespace, lowercased token by token and re-joined with single spaces,
# followed by one trailing space per tweet. A single join replaces the
# index-based loop and the repeated string concatenation.
comment_words = ''.join(
    " ".join(token.lower() for token in str(val).split()) + " "
    for val in hate_speech.tweet
)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='black',
    stopwords=stopwords,
    min_font_size=12,
).generate(comment_words)

# Render the cloud as an image without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; Normal Speech Word Cloud</font>

In [None]:
# Word cloud of every tweet labelled as normal speech (label == 0).
# NOTE: this rebinds `normSpeech`, which the pie-chart cell used for a row
# count, to the filtered DataFrame.
normSpeech = df[df['label'] == 0]

# Words left out of the cloud: the stop-word list shipped with wordcloud.
stopwords = set(STOPWORDS)

# Build the cloud's input text in one pass: each cell is coerced to str, split
# on whitespace, lowercased token by token and re-joined with single spaces,
# followed by one trailing space per tweet. A single join replaces the
# index-based loop and the repeated string concatenation.
comment_words = ''.join(
    " ".join(token.lower() for token in str(val).split()) + " "
    for val in normSpeech.tweet
)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='black',
    stopwords=stopwords,
    min_font_size=12,
).generate(comment_words)

# Render the cloud as an image without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### <font color = #FFE6E6> Data Cleaning</font>

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; Text Cleaning</font>

In [None]:
# Compiled once: matches an @mention (letters, digits and underscores).
_MENTION_RE = re.compile(r'@[A-Za-z0-9_]+')
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean(text):
    """Normalise a tweet before tokenising.

    The steps run in sequence, each on the output of the previous one:

    1. lowercase the text;
    2. remove ``@user`` mentions;
    3. delete ASCII punctuation (``string.punctuation``);
    4. keep only the whitespace-separated tokens that are purely
       alphanumeric and re-join them with single spaces.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text, or an empty string if no token survives.
    """
    # Each step must consume the previous step's result; restarting from the
    # raw `text` would throw away everything done before it.
    lowered = text.lower()
    without_mentions = _MENTION_RE.sub('', lowered)
    without_punct = without_mentions.translate(_PUNCTUATION_TABLE)
    return ' '.join(tok for tok in without_punct.split() if tok.isalnum())
# Run the text cleaner over every tweet, replacing the column in place.
df['tweet'] = df['tweet'].apply(clean)

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; Pre-Processing</font>

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; Tokenizing</font>

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
def tokenize(text):
    """Return the list of word tokens NLTK's ``word_tokenize`` finds in *text*."""
    return word_tokenize(text)


# Replace each tweet string with its list of tokens.
df['tweet'] = df['tweet'].apply(tokenize)

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; Removing StopWords</font>

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Frozen-set copy of the stop-word list so each membership test is a hash
# lookup instead of a scan of the whole list.
_stop_lookup = frozenset(stop)


def StopWords(text):
    """Remove English stop words from a sequence of tokens.

    Args:
        text: Iterable of word tokens, e.g. the list produced by ``tokenize``.

    Returns:
        The remaining tokens joined into one space-separated string.
    """
    return ' '.join(word for word in text if word not in _stop_lookup)


# Each tweet goes from a token list back to a single string without stop words.
df['tweet'] = df['tweet'].apply(StopWords)

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; Lemmatization</font>

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Lemmatize every token of *text* with the WordNet lemmatizer.

    Args:
        text: Either a list of tokens or a whitespace-separated string. A
            string is split into words first; iterating over it directly
            would lemmatize one character at a time.

    Returns:
        A list holding the lemma of each token, in order.
    """
    tokens = text.split() if isinstance(text, str) else text
    return [lemmatizer.lemmatize(token) for token in tokens]


# TODO: still disabled. At this point each tweet is a space-joined string (the
# output of StopWords), and the TF-IDF step that follows expects strings, so
# the lemmas would have to be re-joined before storing them, e.g.
#   df['tweet'] = df['tweet'].apply(lambda t: ' '.join(lemmatize(t)))
# df['tweet']=df['tweet'].apply(lemmatize) #breaks stuff, fix

### <font color = #FFE6E6> Create ML Model</font>

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; Feature Extraction</font>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the pre-processed tweets into TF-IDF features, capped at 5000 terms.
corpus = df['tweet']
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Dense array copy of the TF-IDF matrix; this rebinds `text`, which earlier
# held the raw tweet column.
text = tfidf_matrix.toarray()

### <font color = #FFE6E6> &nbsp; &nbsp; &nbsp; &nbsp; Split Dataset</font>

In [None]:
from sklearn.model_selection import train_test_split
# Take the target by column name: read_csv keeps the file's own column order
# regardless of the order given in `usecols`, so `iloc[:, 0]` is only the
# label column when the CSV happens to list it first.
labels = df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    text, labels, test_size=.3, random_state=0
)

print('Training data:', x_train.shape)
print('Testing data:', x_test.shape)