# Imports

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd drive/MyDrive/Fax/FINKI/Semestar-7/NLP/Project/

/content/drive/MyDrive/Fax/FINKI/Semestar-7/NLP/Project


In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [336]:
import pandas as pd
import numpy as np
import os
import nltk
import re
import seaborn as sns
import spacy
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 
lemmatizer = nltk.WordNetLemmatizer()
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [337]:
results_folder = f'./Results/'

In [338]:
os.makedirs(results_folder, exist_ok=True)

In [339]:
pwd = '/content/drive/MyDrive/Fax/FINKI/Semestar-7/NLP/Project'
data_folder = f'{pwd}/data/'
truth_file = f'{pwd}/data/truth.txt'

# Data 

In [340]:
stemmer = SnowballStemmer(language='english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [341]:
def flat_list(var):
    flat = [item for sublist in var for item in sublist]
    return flat

In [342]:
truth_df = pd.read_csv(truth_file, delimiter=":::", header=None)
truth_df.columns = ['id', 'class']

  """Entry point for launching an IPython kernel.


In [343]:
data_files = os.listdir(data_folder)

In [399]:
tweets = []
class_list = []
for file in data_files:
    if file.endswith('.xml'):
        root = ET.parse(data_folder + file).getroot()
        for node in root.iter('documents'):
            for elem in node.iter():
                if not elem.tag==node.tag:
                    class_item = truth_df[truth_df['id'] == file[:-4]]['class'].item()
                    class_list.append(class_item)
                    tweets.append(elem.text)

In [400]:
df = pd.DataFrame({'tweet': tweets, 'class': class_list})

In [401]:
df['tweet_low'] = df['tweet'].apply(lambda x: x if type(x)!=str else x.lower())

In [402]:
df['no_url'] = [tweet.replace('#url#', '') for tweet  in df['tweet_low']]

In [403]:
df['no_user'] = [tweet.replace('#user#', '') for tweet  in df['no_url']]

In [404]:
df['no_hashtag'] = [tweet.replace('#hashtag#', '') for tweet  in df['no_user']]

In [405]:
df['no_user_no_special'] = df['no_hashtag'].str.replace("[^a-zA-Z#']", " ")

In [406]:
tweets = df['no_user_no_special']
tweets_clean = []
for tweet in tweets:
    tweet = nltk.word_tokenize(tweet)
    tweet = [word for word in tweet if not word in stop_words]
    tweet = [re.sub(r'[^\w\s]','',word) for word in tweet]
    tweet = [lemmatizer.lemmatize(each_word, pos='v') for each_word in tweet]
    tweet = [word for word in tweet if len(word)>2]
    tweets_clean.append(' '.join(tweet))

In [407]:
df['removed_stop_and_lem'] = tweets_clean

In [408]:
tweets = df['no_user_no_special']
tweets_clean = []
for tweet in tweets:
    tweet = nltk.word_tokenize(tweet)
    tweet = [word for word in tweet if not word in stop_words]
    tweet = [re.sub(r'[^\w\s]','',word) for word in tweet]
    tweet = [word for word in tweet if len(word)>2]
    tweets_clean.append(' '.join(tweet))

In [409]:
df['removed_stop'] = tweets_clean

In [410]:
df_renamed = df.rename(columns={'tweet': 'raw_tweet', 'class':'class', 'removed_stop':'tweet'})

In [411]:
df_renamed = df_renamed[['tweet', 'class']]
df_renamed.to_csv("./raw_data.csv")

In [412]:
df['tweet_length'] = df['removed_stop_and_lem'].apply(lambda x: len(x.split()))

In [413]:
print(df['removed_stop_and_lem'][0])
print(df['tweet'][0])

mississippi governor ban transgenders participate female sport
Mississippi Governor Bans Transgenders From Participating In Female Sports #URL#


In [414]:
df = df[df['tweet_length'] > 2]

In [415]:
df

Unnamed: 0,tweet,class,tweet_low,no_url,no_user,no_hashtag,no_user_no_special,removed_stop_and_lem,removed_stop,tweet_length
0,Mississippi Governor Bans Transgenders From Pa...,0,mississippi governor bans transgenders from pa...,mississippi governor bans transgenders from pa...,mississippi governor bans transgenders from pa...,mississippi governor bans transgenders from pa...,mississippi governor bans transgenders from pa...,mississippi governor ban transgenders particip...,mississippi governor bans transgenders partici...,7
1,LIBERAL LUNACY: Ice Cream Flavor Name Changed ...,0,liberal lunacy: ice cream flavor name changed ...,liberal lunacy: ice cream flavor name changed ...,liberal lunacy: ice cream flavor name changed ...,liberal lunacy: ice cream flavor name changed ...,liberal lunacy ice cream flavor name changed ...,liberal lunacy ice cream flavor name change du...,liberal lunacy ice cream flavor name changed d...,10
2,"AOC, Nadler Call on N.Y. Gov. Andrew Cuomo to ...",0,"aoc, nadler call on n.y. gov. andrew cuomo to ...","aoc, nadler call on n.y. gov. andrew cuomo to ...","aoc, nadler call on n.y. gov. andrew cuomo to ...","aoc, nadler call on n.y. gov. andrew cuomo to ...",aoc nadler call on n y gov andrew cuomo to ...,aoc nadler call gov andrew cuomo resign via,aoc nadler call gov andrew cuomo resign via,8
3,WATCH: Mark Levin goes NUCLEAR on Joe Biden fo...,0,watch: mark levin goes nuclear on joe biden fo...,watch: mark levin goes nuclear on joe biden fo...,watch: mark levin goes nuclear on joe biden fo...,watch: mark levin goes nuclear on joe biden fo...,watch mark levin goes nuclear on joe biden fo...,watch mark levin nuclear joe biden try take cr...,watch mark levin goes nuclear joe biden trying...,11
4,New York Legislature Just Took 'First Step' To...,0,new york legislature just took 'first step' to...,new york legislature just took 'first step' to...,new york legislature just took 'first step' to...,new york legislature just took 'first step' to...,new york legislature just took 'first step' to...,new york legislature take first step toward im...,new york legislature took first step toward im...,10
...,...,...,...,...,...,...,...,...,...,...
39995,😂😂😂periodt i had to put my name in all CAPS #URL#,1,😂😂😂periodt i had to put my name in all caps #url#,😂😂😂periodt i had to put my name in all caps,😂😂😂periodt i had to put my name in all caps,😂😂😂periodt i had to put my name in all caps,periodt i had to put my name in all caps,periodt put name cap,periodt put name caps,4
39996,"RT #USER#: sorry if im becoming distant, im tr...",1,"rt #user#: sorry if im becoming distant, im tr...","rt #user#: sorry if im becoming distant, im tr...","rt : sorry if im becoming distant, im trying t...","rt : sorry if im becoming distant, im trying t...",rt sorry if im becoming distant im trying t...,sorry become distant try,sorry becoming distant trying,4
39997,RT #USER#: My next hair appointment the only t...,1,rt #user#: my next hair appointment the only t...,rt #user#: my next hair appointment the only t...,rt : my next hair appointment the only thing i...,rt : my next hair appointment the only thing i...,rt my next hair appointment the only thing i...,next hair appointment thing worry,next hair appointment thing worried,5
39998,RT #USER#: One thing about me ima go to sleep. 😂,1,rt #user#: one thing about me ima go to sleep. 😂,rt #user#: one thing about me ima go to sleep. 😂,rt : one thing about me ima go to sleep. 😂,rt : one thing about me ima go to sleep. 😂,rt one thing about me ima go to sleep,one thing ima sleep,one thing ima sleep,4


In [416]:
df = df.rename(columns={'tweet': 'raw_tweet', 'class':'class', 'removed_stop_and_lem':'tweet'})

In [417]:
df[['tweet', 'class']].to_csv('./dataset.csv')

In [418]:
df_final = df[['tweet', 'class']]

# Cleaned data

In [419]:
df = pd.read_csv('./dataset.csv')

In [420]:
df_hate = df[df['class'] == 1]
df_no_hate = df[df['class'] == 0]

In [421]:
tweets_hate = df_hate['tweet']
tweets_hate = (list(tweets_hate))
tweets_hate = [x for x in tweets_hate if str(x) != 'nan']
tweets_no_hate = df_no_hate['tweet']
tweets_no_hate = (list(tweets_no_hate))
tweets_no_hate = [x for x in tweets_no_hate if str(x) != 'nan']

## Average words in tweets after cleaning

In [422]:
number_of_words = 0
tweet_len = []
for tweet in tweets_hate:
    number_of_words += len(tweet)
    tweet_len.append(len(tweet))
avg_words_in_tweet = round(number_of_words/len(df_hate))
print((number_of_words/len(df_hate)))

38.48608355876165


In [423]:
number_of_words = 0
tweet_len = []
for tweet in tweets_no_hate:
    number_of_words += len(tweet)
    tweet_len.append(len(tweet))
avg_words_in_tweet = round(number_of_words/len(df_hate))
print((number_of_words/len(df_hate)))

38.18755635707845


## Word counts

In [424]:
tweets_hate_flat = [' '.join(tweets_hate[:])]

In [425]:
tweets_hate_flat = tweets_hate_flat[0].split()

In [426]:
tweets_hate_count = Counter(tweets_hate_flat)

In [427]:
tweets_hate_count_df = pd.DataFrame(tweets_hate_count.most_common())
tweets_hate_count_df.columns = ['words', 'count']

In [428]:
tweets_hate_count_df_10 = tweets_hate_count_df[tweets_hate_count_df['count'] > 10]
tweets_hate_count_df_10.to_csv('./Results/tweets_hate_count_df_10.csv')

In [429]:
tweets_no_hate_flat = [' '.join(tweets_no_hate[:])]

In [430]:
tweets_no_hate_flat = tweets_no_hate_flat[0].split()

In [431]:
tweets_no_hate_count = Counter(tweets_no_hate_flat)

In [432]:
tweets_no_hate_count_df = pd.DataFrame(tweets_no_hate_count.most_common())
tweets_no_hate_count_df.columns = ['words', 'count']

In [433]:
tweets_no_hate_count_df_10 = tweets_no_hate_count_df[tweets_no_hate_count_df['count'] > 10]
tweets_no_hate_count_df_10.to_csv('./Results/tweets_no_hate_count_df_10.csv')

In [434]:
tweets_no_hate_count_df_10

Unnamed: 0,words,count
0,get,1337
1,like,944
2,one,739
3,people,666
4,say,610
...,...,...
1653,tuesday,11
1654,title,11
1655,guest,11
1656,fkn,11


# Raw data

In [435]:
df = pd.read_csv('./raw_data.csv')

In [436]:
df_hate = df[df['class'] == 1]
df_no_hate = df[df['class'] == 0]

In [437]:
tweets_hate = df_hate['tweet']
tweets_hate = (list(tweets_hate))
tweets_hate = [x for x in tweets_hate if str(x) != 'nan']
tweets_no_hate = df_no_hate['tweet']
tweets_no_hate = (list(tweets_no_hate))
tweets_no_hate = [x for x in tweets_no_hate if str(x) != 'nan']

## Average words in tweets after cleaning

In [438]:
number_of_words = 0
tweet_len = []
for tweet in tweets_hate:
    number_of_words += len(tweet)
    tweet_len.append(len(tweet))
avg_words_in_tweet = round(number_of_words/len(df_hate))
print((number_of_words/len(df_hate)))

35.1442


In [439]:
number_of_words = 0
tweet_len = []
for tweet in tweets_no_hate:
    number_of_words += len(tweet)
    tweet_len.append(len(tweet))
avg_words_in_tweet = round(number_of_words/len(df_hate))
print((number_of_words/len(df_hate)))

34.97385


## Word counts

In [440]:
tweets_hate_flat = [' '.join(tweets_hate[:])]

In [441]:
tweets_hate_flat = tweets_hate_flat[0].split()

In [442]:
tweets_hate_count = Counter(tweets_hate_flat)

In [443]:
tweets_hate_count_df = pd.DataFrame(tweets_hate_count.most_common())
tweets_hate_count_df.columns = ['words', 'count']

In [444]:
tweets_hate_count_df_10 = tweets_hate_count_df[tweets_hate_count_df['count'] > 10]
tweets_hate_count_df_10.to_csv('./Results/raw_tweets_hate_count_df_10.csv')

In [445]:
tweets_no_hate_flat = [' '.join(tweets_no_hate[:])]

In [446]:
tweets_no_hate_flat = tweets_no_hate_flat[0].split()

In [447]:
tweets_no_hate_count = Counter(tweets_no_hate_flat)

In [448]:
tweets_no_hate_count_df = pd.DataFrame(tweets_no_hate_count.most_common())
tweets_no_hate_count_df.columns = ['words', 'count']

In [449]:
tweets_no_hate_count_df_10 = tweets_no_hate_count_df[tweets_no_hate_count_df['count'] > 10]
tweets_no_hate_count_df_10.to_csv('./Results/raw_tweets_no_hate_count_df_10.csv')