# Imports

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file.
# URL is reassigned on each iteration, so once the loop finishes it holds
# the path of the last file visited; the data-loading cell reads it.
input_root = '/kaggle/input'
for folder, _, files in os.walk(input_root):
    for file_name in files:
        URL = os.path.join(folder, file_name)
        print(URL)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy

import datetime
import random

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Seed the stdlib, NumPy and spaCy random number generators with the same
# value so that shuffling and training are repeatable between runs.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
spacy.util.fix_random_seed(SEED)

# Load Data

In [None]:
# Load the CSV at the path stored in URL, report how many rows it has,
# and preview the first few records.
data = pd.read_csv(URL)
print(data.shape[0])
data.head()

# Drop Columns/NA values

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Remove the first column (presumably an exported row index -- verify
# against the CSV) and then every row that still has a missing value.
first_column = data.columns[0]
data = data.drop(columns=first_column).dropna()
print(data.shape[0])
data.head()

# WordCloud of Insults & Targets

In [None]:
# Join every tweet into one space-separated string and build a word cloud
# from it.
tweet_text = " ".join(data["tweet"])
cloud_builder = WordCloud(width=1500, height=500)
wordcloud = cloud_builder.generate(tweet_text)

# Render the cloud as an image on a wide figure with the axes hidden.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Join every value of the "target" column into one space-separated string
# and build a word cloud from it.
target_text = " ".join(data["target"])
cloud_builder = WordCloud(width=1500, height=500)
wordcloud = cloud_builder.generate(target_text)

# Render the cloud as an image on a wide figure with the axes hidden.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Insults per Target

In [None]:
# Count the rows recorded for each target, keep the ten largest counts,
# and draw them as a horizontal bar chart.
insults_per_target = data.groupby("target")["target"].count()
top_targets = insults_per_target.nlargest(10)
top_targets.plot(kind="barh")

# Parsing Dates & Insults over Time

In [None]:
# Convert the "date" column to pandas datetimes so the .dt accessors can
# be used on it, then confirm the resulting dtype.
parsed_dates = pd.to_datetime(data["date"])
data["date"] = parsed_dates
print(data.dtypes["date"])
data.head()

In [None]:
# Plot how many insults were recorded in each calendar year.
plt.title("Insults over Years")
plt.xlabel("Year")
# value_counts() returns raw row counts, not percentages, so the axis is
# labelled as a count.
plt.ylabel("Number of Insults")

# Calendar year of every row; the classification cells read this column.
data["year"] = data["date"].dt.year

# 2021 is left out because that year was not complete when the data was
# gathered. errors="ignore" keeps the cell working when the data has no
# 2021 rows at all.
insults_over_year = data["year"].value_counts().drop(2021, errors="ignore")

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x=insults_over_year.index, y=insults_over_year.values)

# Text Classification: Predicting the Year Tweeted

In [None]:
# Start from a blank English pipeline and attach a single text categorizer.
# NOTE(review): create_pipe(...) followed by add_pipe(component) is the
# spaCy v2 calling convention -- confirm the installed spaCy version.
nlp = spacy.blank("en")

# Labels are treated as mutually exclusive; "bow" selects the architecture
# (presumably spaCy's bag-of-words model -- verify against the spaCy docs).
textcat_config = {"exclusive_classes": True, "architecture": "bow"}
textcat = nlp.create_pipe("textcat", config=textcat_config)

nlp.add_pipe(textcat)

In [None]:
# Register one text-categorizer label per distinct year found in the data.
# A plain loop is used because add_label is called for its side effect;
# `labels` keeps the sorted list of years instead of being overwritten with
# add_label's return values.
labels = np.unique(data["year"].values).tolist()
for label in labels:
    textcat.add_label(str(label))

In [None]:
# Build the (text, annotation) pairs used for training.
train_text = data["tweet"].values

# Category keys are the distinct years present in the data, as strings,
# which is the same set np.unique(data["year"]) yields when the labels are
# registered on the text categorizer.
year_keys = [str(year) for year in np.unique(data["year"].values)]

# One {"cats": {...}} dict per row with exactly one True entry. Both sides
# of the comparison are strings: data["year"] holds integers, so comparing
# a value to a string literal such as '2014' is always False and would mark
# every category False for every row.
train_label = [
    {"cats": {key: key == str(label) for key in year_keys}}
    for label in data["year"]
]

train_data = list(zip(train_text, train_label))
train_data[0]

In [None]:
# Train the text categorizer for 10 epochs over shuffled mini-batches of 4.
optimizer = nlp.begin_training()

for epoch in range(10):
    # Start from an empty dict every epoch: nlp.update adds each batch's
    # loss into `losses`, so a dict shared across epochs would print a
    # running total instead of the loss of the epoch just finished.
    losses = {}
    random.shuffle(train_data)
    for batch in spacy.util.minibatch(train_data, size=4):
        # Split the batch of (text, annotation) pairs into two parallel tuples.
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, losses=losses)
    print(losses)

## Making Predictions

In [None]:
# Score two sample texts with the trained text categorizer.
texts = ["GET SLEEPY JOE", "GET HILLARY CLINTON"]

# Only the tokenizer is applied, so each text becomes a Doc without being
# run through the rest of the pipeline.
docs = list(map(nlp.tokenizer, texts))

textcat = nlp.get_pipe("textcat")
scores, _ = textcat.predict(docs)

print(scores)

# For each text take the index of the highest score and map it back to the
# label stored at that position.
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[index] for index in predicted_labels])