Text Preprocessing
--------
Text preprocessing applied to the competition datasets.
The preprocessing consists of 4 steps:

 1. **Removing HTML tags and URIs from contents**
 2. **Removing punctuation from titles and contents**
 3. **Removing stopwords from titles and contents**
 4. **Converting the tags from string to a list of tags**

These operations can be used as a first step for any other process regarding the competition.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
import string

Datasets loading
---------

In [2]:
# Load each topic's CSV from ../input into a dataframe, keyed by topic name.
dataframes = {
    topic: pd.read_csv("../input/" + topic + ".csv")
    for topic in ("cooking", "crypto", "robotics", "biology", "travel", "diy")
}

For simplicity, I'll show an example of each preprocessing step on one item of the robotics dataset.

In [4]:
# Print the row at positional index 1 of the robotics dataframe in full,
# then show the first rows of the frame as the cell's output.
print(dataframes["robotics"].iloc[1])
dataframes["robotics"].head()

Removing HTML tags and URIs from contents
-----------

In [5]:
# Pattern used to find URIs (http(s)://..., www...., or bare domain/path forms)
# inside the extracted text so they can be deleted.
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    """Return the plain text of an HTML fragment without code blocks and URIs.

    Args:
        x: HTML content of a post. ``None``, the empty string and NaN (the
            value pandas uses for a missing CSV cell) are treated as
            "no content".

    Returns:
        The text of ``x`` with every ``<code>`` element (tag and contents)
        dropped, the remaining markup stripped, and every match of ``uri_re``
        deleted. Returns ``""`` when there is no content.
    """
    # `x != x` is true only for NaN; without this guard a missing cell would be
    # handed to BeautifulSoup because NaN is truthy.
    if x is None or x != x or not x:
        return ""
    # BeautifulSoup on content
    soup = BeautifulSoup(x, "html.parser")
    # `soup.code` / `soup.find("code")` only yields the FIRST <code> element,
    # so keep removing until none is left. Re-querying after each removal also
    # avoids touching nested <code> tags that were already destroyed.
    code_tag = soup.find("code")
    while code_tag is not None:
        code_tag.decompose()
        code_tag = soup.find("code")
    # Get all the text out of the html
    text = soup.get_text()
    # Returning text stripping out all uris
    return re.sub(uri_re, "", text)

In [8]:
# Show the first rows of the robotics dataframe.
# NOTE(review): this cell is numbered In [8] but sits before In [6]/In [7], so
# the notebook was apparently executed out of order — confirm which state of
# the frame this output reflects.
dataframes["robotics"].head()

In [6]:
# Replace the "content" column of every dataframe with its cleaned version
# (HTML markup, <code> elements and URIs removed by stripTagsAndUris).
# This could take a while
for topic in dataframes:
    cleaned_content = dataframes[topic]["content"].map(stripTagsAndUris)
    dataframes[topic]["content"] = cleaned_content

In [7]:
# Print the robotics row at positional index 1 again, to compare its content
# with the earlier printout now that stripTagsAndUris has been applied.
print(dataframes["robotics"].iloc[1])

Removing punctuation from titles and contents
-----------

In [None]:
# Translation table mapping every ASCII punctuation character to one space.
# A table is used instead of "[" + string.punctuation + "]" because, inside a
# regex character class, the unescaped "\]" of string.punctuation is read as an
# escaped "]" and the backslash itself is then never replaced.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)
# Any character outside the 7-bit ASCII range.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')


def removePunctuation(x):
    """Lowercase ``x`` and blank out non-ASCII and punctuation characters.

    Args:
        x: The text to normalise (a ``str``).

    Returns:
        ``x`` lowercased, with each non-ASCII character and each character of
        ``string.punctuation`` (backslash included) replaced by one space.
        Characters are replaced one-for-one, so runs of spaces are not
        collapsed.
    """
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars (done after lowercasing, as lowercasing itself
    # can produce non-ASCII characters)
    x = _NON_ASCII_RE.sub(" ", x)
    # Replacing every punctuation character with a space
    return x.translate(_PUNCTUATION_TO_SPACE)

In [None]:
# Normalise the title first and then the content of every dataframe in place.
for df in dataframes.values():
    for column in ("title", "content"):
        df[column] = df[column].map(removePunctuation)

In [None]:
# Print the robotics row at positional index 1 to inspect the title and content
# after removePunctuation has been applied.
print(dataframes["robotics"].iloc[1])

Removing stopwords from titles and contents
-----------

In [None]:
# NLTK's English stopword list, held in a set for fast membership tests.
stops = set(stopwords.words("english"))


def removeStopwords(x):
    """Return ``x`` without the whitespace-separated words found in ``stops``.

    The remaining words keep their original order and are joined with single
    spaces, so any run of whitespace in ``x`` collapses to one space.
    """
    return " ".join(token for token in x.split() if token not in stops)

In [None]:
# Drop stopwords from the title first and then the content of every dataframe.
for df in dataframes.values():
    for column in ("title", "content"):
        df[column] = df[column].map(removeStopwords)

In [None]:
# Print the robotics row at positional index 1 to inspect the title and content
# after removeStopwords has been applied.
print(dataframes["robotics"].iloc[1])

Splitting the tags string into a list of tags
-----------

In [9]:
# Turn each whitespace-separated tags string into a list of individual tags.
for df in dataframes.values():
    tag_lists = df["tags"].map(str.split)
    df["tags"] = tag_lists

In [10]:
# Show the first rows of the robotics dataframe; the "tags" column now holds
# lists of tags instead of a single string.
dataframes["robotics"].head()

### Total number of labels

In [11]:
# Flatten the per-row tag lists of the robotics dataframe into one flat list.
# Duplicates are kept on purpose so that tag frequencies can be counted from it.
# The Series is iterated directly instead of through Series.iteritems(), which
# was removed in pandas 2.0; the index values were never used anyway.
Total_labels = [
    tag
    for tag_list in dataframes["robotics"]['tags']
    for tag in tag_list
]
print("length of total labels")
# display() is the IPython/Jupyter rich-output helper available in notebooks.
display(len(Total_labels))

In [12]:
## Check to see if there are any duplicate labels
import operator
from collections import Counter
# Count how often each label occurs and keep the result as a dict ordered from
# the most to the least frequent label; most_common() with no argument returns
# every (label, count) pair in that order.
Labels_freq = dict(Counter(Total_labels).most_common())
print("Unique lables")
print(len(Labels_freq))


In [13]:
# Two-column dataframe of (label, frequency) pairs built from Labels_freq.
# No column names are given, so the columns are labelled 0 (label) and
# 1 (frequency); the plotting cell refers to them as x=0 and y=1.
D = pd.DataFrame(list(Labels_freq.items()))
# Bare expression: rendered as the cell's output in the notebook.
D

In [14]:
# Keep only the first 50 rows of D for plotting (the bar chart below is titled
# "Top 50 tags in dataset").
Plot_D = D.head(50)

In [15]:
# Bar chart of the rows in Plot_D: column 0 (the tag) on the x axis and
# column 1 (its frequency) as the bar height.
ax = Plot_D.plot.bar(
    x=0,
    y=1,
    title="Top 50 tags in dataset",
    fontsize=12,
    figsize=(15, 10),
    legend=False,
)
ax.set_xlabel("Tags", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)

In [None]:
# Print the robotics row at positional index 1 one more time, in its current
# (preprocessed) state, before the dataframes are saved.
print(dataframes["robotics"].iloc[1])

Saving preprocessed dataframes to csv
-----------

In [None]:
# Write every preprocessed dataframe to "<name>_light.csv" in the current
# working directory, leaving the index column out of the file.
for name, df in dataframes.items():
    output_path = f"{name}_light.csv"
    df.to_csv(output_path, index=False)