In [18]:
import os
import pandas as pd
import re
import string
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [19]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/andrei-
[nltk_data]     cristian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andrei-
[nltk_data]     cristian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Preprocessing

In [20]:
RAW_DATA_FOLDER_PATH = "../data/raw"
PREPROCESSED_DATA_FOLDER_PATH = "../data/processed"
PREPROCESSED_FILE_NAME = "preprocessed_data.csv"

In [21]:
csv_paths = [os.path.join(RAW_DATA_FOLDER_PATH, file) for file in os.listdir(RAW_DATA_FOLDER_PATH)]

df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])

In [22]:
df.drop(['COMMENT_ID', 'AUTHOR', 'DATE'], axis=1, inplace=True)


Unnamed: 0,index,CONTENT,CLASS
0,0,+447935454150 lovely girl talk to me xxx﻿,1
1,1,I always end up coming back to this song<br />﻿,0
2,2,"my sister just received over 6,500 new <a rel=...",1
3,3,Cool﻿,0
4,4,Hello I&#39;am from Palastine﻿,1
...,...,...,...
1951,345,This song means so much to me thank you soooo...,0
1952,346,&lt;3﻿,0
1953,347,"KATY PERRY, I AM THE ""DÉCIO CABELO"", ""DECIO HA...",1
1954,348,Honestly speaking except taylor swift and adel...,0


In [6]:
df.head(5)

Unnamed: 0,CONTENT,CLASS
0,+447935454150 lovely girl talk to me xxx﻿,1
1,I always end up coming back to this song<br />﻿,0
2,"my sister just received over 6,500 new <a rel=...",1
3,Cool﻿,0
4,Hello I&#39;am from Palastine﻿,1


In [7]:
df.shape

(1956, 2)

In [8]:
def remove_emoji(text: str) -> str:
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    emoji_re = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        
    return emoji_re.sub(r'', text)
    
def normalize_lower(text: str) -> str:    
    return text.lower()
    
def strip_spaces(text: str) -> str:
    return text.strip()
    
def remove_contractions(text: str) -> str:   
    contractions = {"i'm": "i am",
                    "he's": "he is",
                    "she's": "she is",
                    "it's": "it is",
                    "that's": "that is",
                    "what's": "what is",
                    "where's": "where is",
                    "\'ll": " will",
                    "\'ve": " have",
                    "\'re": " are",
                    "\'d": " would",
                    "\'ve": " have",
                    "\'t": " not"}
    for key, value in contractions.items():
        text = re.sub(key, value, text)
    return text

def remove_links(text: str) -> str:
    pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return pattern.sub('', text)

In [9]:
preprocessing_steps = (remove_emoji, remove_links, normalize_lower, remove_contractions)

stop_words = set(stopwords.words("english"))

def preprocess(s: pd.Series) -> pd.Series:
    comments = []
    lines = s.astype(str).values.tolist()
    for line in lines:
        for step in preprocessing_steps:
            line = step(line)
        tokens = word_tokenize(line)
        
        table = str.maketrans("", "", string.punctuation)
        tokens = [token.translate(table) for token in tokens]
        
        tokens = " ".join([token for token in tokens if token not in stop_words])
        re.sub("\s\s+", ' ', tokens)
        tokens = strip_spaces(tokens)
        comments.append(tokens)
    return comments

In [10]:
df

Unnamed: 0,CONTENT,CLASS
0,+447935454150 lovely girl talk to me xxx﻿,1
1,I always end up coming back to this song<br />﻿,0
2,"my sister just received over 6,500 new <a rel=...",1
3,Cool﻿,0
4,Hello I&#39;am from Palastine﻿,1
...,...,...
345,This song means so much to me thank you soooo...,0
346,&lt;3﻿,0
347,"KATY PERRY, I AM THE ""DÉCIO CABELO"", ""DECIO HA...",1
348,Honestly speaking except taylor swift and adel...,0


In [11]:
df["CONTENT"] = preprocess(df["CONTENT"])

df.rename(columns={"CONTENT": "content",
          "CLASS": "label"}, inplace=True)

In [12]:
len(df)

1956

In [13]:
df = df[df['content'] != '']

In [14]:
len(df)

1900

In [15]:
df.to_csv(os.path.join(PREPROCESSED_DATA_FOLDER_PATH, PREPROCESSED_FILE_NAME))

In [16]:
lengths = df["content"].apply(lambda x : len(x.split()))
max(lengths)

98