# Imports

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that later
# cells can read the project files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Change the working directory to the project folder. The path is relative,
# so this only resolves when the current directory is the one containing `drive/`.
%cd drive/MyDrive/Fax/FINKI/Semestar-7/NLP/Project/

/content/drive/MyDrive/Fax/FINKI/Semestar-7/NLP/Project


In [4]:
import nltk

# Fetch the NLTK resources used by the preprocessing cells: the stop-word
# lists, the punkt tokenizer models, and the WordNet corpus.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
import pandas as pd
import numpy as np
import os
import nltk
import re
import seaborn as sns
import spacy
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords
# English stop words from NLTK, held in a set so that membership checks during
# token filtering are constant-time.
stop_words = set(stopwords.words('english')) 
# Module-level WordNet lemmatizer instance.
lemmatizer = nltk.WordNetLemmatizer()
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

# Data 

In [25]:
stemmer = SnowballStemmer(language='english')
# Built once and reused: the function below is called for every token of every
# tweet, so constructing a new lemmatizer per call is wasted work.
_wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then reduce it to its Snowball stem.

    Args:
        text: One token (a string) to normalise.

    Returns:
        The English Snowball stem of the verb-lemmatized token.
    """
    return stemmer.stem(_wordnet_lemmatizer.lemmatize(text, pos='v'))

In [6]:
# Absolute project root on the mounted Drive; the data directory and the
# ground-truth file are derived from it.
pwd = '/content/drive/MyDrive/Fax/FINKI/Semestar-7/NLP/Project'
# Keeps its trailing slash because later cells build paths by concatenation.
data_folder = pwd + '/data/'
truth_file = data_folder + 'truth.txt'

In [7]:
# Load the ground-truth table: header-less records of the form "<id>:::<class>".
# A multi-character separator is only handled by the Python parsing engine, so
# it is requested explicitly instead of relying on the fallback that makes
# pandas emit a ParserWarning. `id` is forced to str because it is later
# compared against file-name stems, which are always strings.
truth_df = pd.read_csv(
    truth_file,
    sep=':::',
    header=None,
    names=['id', 'class'],
    engine='python',
    dtype={'id': str},
)

  """Entry point for launching an IPython kernel.


In [8]:
# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the assembled dataset reproducible across runs and machines.
data_files = sorted(os.listdir(data_folder))

In [9]:
# Collect every tweet text from the per-author XML files together with the
# class label of the file it came from. The two lists stay index-aligned.
tweets = []
class_list = []
for file in data_files:
    if not file.endswith('.xml'):
        continue
    # The file name without its ".xml" suffix is the id used in truth_df.
    author_id = file[:-4]
    root = ET.parse(data_folder + file).getroot()
    # Text of every element nested under a <documents> node; node.iter() also
    # yields the node itself, which the tag comparison excludes.
    texts = [
        elem.text
        for node in root.iter('documents')
        for elem in node.iter()
        if elem.tag != node.tag
    ]
    if not texts:
        continue
    # The label depends only on the file, so it is looked up once per file
    # instead of re-scanning truth_df for every single tweet.
    # .item() raises ValueError unless exactly one truth row matches the id.
    class_item = truth_df.loc[truth_df['id'] == author_id, 'class'].item()
    tweets.extend(texts)
    class_list.extend([class_item] * len(texts))

In [10]:
# Assemble the collected tweet texts and their per-tweet class labels into a
# two-column frame.
df = pd.DataFrame({'tweet': tweets, 'class': class_list})

In [11]:
# Persist the unprocessed tweets to the current working directory. With the
# default to_csv settings the index is written too, as a first unnamed column.
df.to_csv("raw_data.csv")

Unnamed: 0,tweet,class
0,Mississippi Governor Bans Transgenders From Pa...,0
1,LIBERAL LUNACY: Ice Cream Flavor Name Changed ...,0
2,"AOC, Nadler Call on N.Y. Gov. Andrew Cuomo to ...",0
3,WATCH: Mark Levin goes NUCLEAR on Joe Biden fo...,0
4,New York Legislature Just Took 'First Step' To...,0
...,...,...
39995,😂😂😂periodt i had to put my name in all CAPS #URL#,1
39996,"RT #USER#: sorry if im becoming distant, im tr...",1
39997,RT #USER#: My next hair appointment the only t...,1
39998,RT #USER#: One thing about me ima go to sleep. 😂,1


In [13]:
# Lower-case every tweet; values that are not plain strings pass through unchanged.
df['tweet_low'] = df['tweet'].map(
    lambda text: text.lower() if type(text) is str else text
)

In [14]:
# Strip the "#url#" placeholder (already lower-cased). The vectorised .str
# accessor leaves missing / non-string entries as missing instead of raising
# AttributeError, matching the tolerance of the lower-casing step.
# regex=False: the placeholder is a literal, not a pattern.
df['no_url'] = df['tweet_low'].str.replace('#url#', '', regex=False)

In [15]:
# Strip the "#user#" placeholder (already lower-cased). The vectorised .str
# accessor leaves missing / non-string entries as missing instead of raising
# AttributeError on them.
# regex=False: the placeholder is a literal, not a pattern.
df['no_user'] = df['no_url'].str.replace('#user#', '', regex=False)

In [16]:
# Replace every character that is not an ASCII letter, '#' or an apostrophe
# with a space. regex=True must be explicit: since pandas 2.0 the default is
# regex=False, under which the character class would be searched for as a
# literal string and nothing would be cleaned.
df['no_user_no_special'] = df['no_user'].str.replace("[^a-zA-Z#']", " ", regex=True)

In [32]:
tweets = df['no_user_no_special']

# Compiled once; matches any character that is neither a word character nor whitespace.
_non_word_re = re.compile(r'[^\w\s]')


def _clean_tweet(text):
    """Tokenize one tweet and return its normalised tokens joined by spaces.

    Steps, in order: tokenize, drop English stop words, strip leftover
    punctuation from each token, lemmatize + stem each token, then discard
    tokens of three characters or fewer.

    Args:
        text: The tweet text. Anything that is not a string (e.g. a missing
            value) is treated as an empty tweet rather than crashing the
            tokenizer.

    Returns:
        A single space-joined string of the surviving tokens ('' if none).
    """
    if not isinstance(text, str):
        return ''
    tokens = [tok for tok in nltk.word_tokenize(text) if tok not in stop_words]
    tokens = [_non_word_re.sub('', tok) for tok in tokens]
    tokens = [lemmatize_stemming(tok) for tok in tokens]
    # The length filter runs after stemming, so it applies to the stem.
    return ' '.join(tok for tok in tokens if len(tok) > 3)


tweets_clean = [_clean_tweet(tweet) for tweet in tweets]

In [33]:
# Attach the cleaned, space-joined token strings as a new column; the list is
# assigned positionally, one entry per row of df.
df['removed_stop_and_lem'] = tweets_clean

In [34]:
# Number of whitespace-separated tokens left in each cleaned tweet.
df['tweet_length'] = [
    len(cleaned.split()) for cleaned in df['removed_stop_and_lem']
]

In [35]:
# Distribution of cleaned-tweet lengths: how many tweets have each token count.
df['tweet_length'].value_counts()

4     4845
5     4575
3     4449
6     4200
7     3816
8     3540
2     3054
9     2938
10    2545
11    1801
12    1280
1     1038
13     905
14     488
15     228
16     108
0       88
17      53
18      23
19      21
20       4
21       1
Name: tweet_length, dtype: int64

In [None]:
# Display the cleaned-text column for a visual check.
df['removed_stop_and_lem']

mississippi governor ban transgend particip femal sport
Mississippi Governor Bans Transgenders From Participating In Female Sports #URL#


In [None]:
# Keep only tweets that still have more than two tokens after cleaning.
df = df.loc[df['tweet_length'].gt(2)]

In [None]:
# Display the filtered frame with all intermediate preprocessing columns.
df

Unnamed: 0,tweet,class,tweet_low,no_url,no_user,no_user_no_special,removed_stop_and_lem,tweet_length
0,Mississippi Governor Bans Transgenders From Pa...,0,mississippi governor bans transgenders from pa...,mississippi governor bans transgenders from pa...,mississippi governor bans transgenders from pa...,mississippi governor bans transgenders from pa...,mississippi governor ban transgenders particip...,7
1,LIBERAL LUNACY: Ice Cream Flavor Name Changed ...,0,liberal lunacy: ice cream flavor name changed ...,liberal lunacy: ice cream flavor name changed ...,liberal lunacy: ice cream flavor name changed ...,liberal lunacy ice cream flavor name changed ...,liberal lunacy ice cream flavor name changed d...,10
2,"AOC, Nadler Call on N.Y. Gov. Andrew Cuomo to ...",0,"aoc, nadler call on n.y. gov. andrew cuomo to ...","aoc, nadler call on n.y. gov. andrew cuomo to ...","aoc, nadler call on n.y. gov. andrew cuomo to ...",aoc nadler call on n y gov andrew cuomo to ...,aoc nadler call n gov andrew cuomo resign via,9
3,WATCH: Mark Levin goes NUCLEAR on Joe Biden fo...,0,watch: mark levin goes nuclear on joe biden fo...,watch: mark levin goes nuclear on joe biden fo...,watch: mark levin goes nuclear on joe biden fo...,watch mark levin goes nuclear on joe biden fo...,watch mark levin go nuclear joe biden trying t...,12
4,New York Legislature Just Took 'First Step' To...,0,new york legislature just took 'first step' to...,new york legislature just took 'first step' to...,new york legislature just took 'first step' to...,new york legislature just took 'first step' to...,new york legislature took first step toward i...,10
...,...,...,...,...,...,...,...,...
39995,😂😂😂periodt i had to put my name in all CAPS #URL#,1,😂😂😂periodt i had to put my name in all caps #url#,😂😂😂periodt i had to put my name in all caps,😂😂😂periodt i had to put my name in all caps,periodt i had to put my name in all caps,periodt put name cap,4
39996,"RT #USER#: sorry if im becoming distant, im tr...",1,"rt #user#: sorry if im becoming distant, im tr...","rt #user#: sorry if im becoming distant, im tr...","rt : sorry if im becoming distant, im trying t...",rt sorry if im becoming distant im trying t...,rt sorry im becoming distant im trying,7
39997,RT #USER#: My next hair appointment the only t...,1,rt #user#: my next hair appointment the only t...,rt #user#: my next hair appointment the only t...,rt : my next hair appointment the only thing i...,rt my next hair appointment the only thing i...,rt next hair appointment thing worried,6
39998,RT #USER#: One thing about me ima go to sleep. 😂,1,rt #user#: one thing about me ima go to sleep. 😂,rt #user#: one thing about me ima go to sleep. 😂,rt : one thing about me ima go to sleep. 😂,rt one thing about me ima go to sleep,rt one thing ima go sleep,6


In [None]:
# Write the filtered frame, including every intermediate column, to the current
# working directory. With the default to_csv settings the index is written as well.
df.to_csv('dataset.csv')