In [1]:
!git clone https://github.com/robinloh/H6751TextandWebMining.git

Cloning into 'H6751TextandWebMining'...
remote: Enumerating objects: 33, done.[K
remote: Total 33 (delta 0), reused 0 (delta 0), pack-reused 33[K
[KUnpacking objects: 100% (33/33), done.
[KUpdating files: 100% (14/14), done.


In [2]:
import nltk
import pandas as pd
import re as regex

pd.set_option('display.max_colwidth',5000)

!pip install plotly
!pip install textblob

import plotly
from plotly import graph_objs

# plotly configuration
#plotly.offline.init_notebook_mode()



In [3]:
# Location of the training split inside the cloned repository
# (a relative file path, despite the variable name).
url = "H6751TextandWebMining/train.csv"

# Load the raw comments and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


In [6]:
class Initialize():
    """Loads the labelled comment CSV and holds the state shared by later steps.

    Attributes:
        data: rows of the CSV whose six label columns all hold 0 or 1.
        processed_data: the frame later steps add columns to. It is the SAME
            object as ``data`` (not a copy), so added columns appear on both.
        wordlist: vocabulary list, reset to empty by ``initialize``.
        data_model / data_labels: reset to ``None`` by ``initialize``.
        is_testing: flag read by subclasses; ``False`` by default.
    """

    # Column names forced onto the CSV, in file order.
    COLUMN_NAMES = ["id", "text", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    # The binary label columns (everything after id and text).
    LABEL_COLUMNS = COLUMN_NAMES[2:]

    data = []
    processed_data = []
    wordlist = []

    data_model = None
    data_labels = None
    is_testing = False

    def initialize(self, csv_file):  # read wiki data
        """Read ``csv_file`` and keep only rows with valid 0/1 labels.

        The file's own header row is discarded (``header=0``) and replaced by
        ``COLUMN_NAMES``. If the file carries one extra leading column, pandas
        uses it as the index.

        Args:
            csv_file: path or buffer accepted by ``pandas.read_csv``.
        """
        raw = pd.read_csv(csv_file, header=0, names=list(self.COLUMN_NAMES))

        # read_csv parses the label columns as integers, and pandas treats the
        # strings "0"/"1" as distinct from the integers 0/1 in isin(), so a
        # string comparison would drop every row. Coerce to numbers first;
        # anything non-numeric becomes NaN and is rejected by the filter.
        labels = raw[self.LABEL_COLUMNS].apply(pd.to_numeric, errors="coerce")
        has_valid_labels = labels.isin([0, 1]).all(axis=1)

        # .copy() detaches the result from `raw`, so adding feature columns
        # later does not trigger SettingWithCopyWarning.
        self.data = raw[has_valid_labels].copy()

        self.processed_data = self.data
        self.wordlist = []
        self.data_model = None
        self.data_labels = None
        

In [7]:
class ExtraFeatures(Initialize):
    """Adds punctuation-count feature columns to ``processed_data``.

    Every feature column is named ``number_of_*``; ``build_data_model`` relies
    on that prefix to find them again.
    """

    # Three dots, each optionally followed by one whitespace character.
    # Compiled once here instead of once per row.
    _ELLIPSIS_PATTERN = regex.compile(r"\.\s?\.\s?\.")

    def __init__(self):
        # No per-instance setup is done here.
        pass

    def build_data_model(self):
        """Create the (currently empty) model frame and label series.

        The columns are: ``"label"`` (omitted when ``is_testing`` is true),
        then every ``number_of*`` column of ``processed_data``, then one
        ``<word>_bow`` column per entry of ``wordlist``. No rows or labels are
        filled in by this method.

        Returns:
            Tuple ``(data_model, data_labels)``, also stored on the instance.
        """
        extra_columns = [col for col in self.processed_data.columns if col.startswith("number_of")]
        label_column = [] if self.is_testing else ["label"]
        bow_columns = [word + "_bow" for word in self.wordlist]
        columns = label_column + extra_columns + bow_columns

        labels = []
        rows = []

        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        return self.data_model, self.data_labels

    def add_column(self, column_name, column_content):
        """Attach ``column_content`` to ``processed_data`` as ``column_name``.

        The values are aligned positionally with the existing rows by reusing
        the frame's index.
        """
        self.processed_data.loc[:, column_name] = pd.Series(column_content, index=self.processed_data.index)

    def build_features(self):  # add additional features
        """Add one count column per punctuation feature, computed from ``text``.

        Columns added, in order: ``number_of_exclamation`` ("!"),
        ``number_of_question`` ("?"), ``number_of_ellipsis`` (three dots,
        optionally whitespace-separated), ``number_of_hashtags`` ("#") and
        ``number_of_mentions`` ("@").
        """
        # Missing text is treated as an empty comment (all counts 0) rather
        # than raising when a non-string value is encountered.
        texts = self.processed_data["text"].fillna("").astype(str)
        ellipsis_pattern = self._ELLIPSIS_PATTERN

        feature_counters = [
            ("number_of_exclamation", lambda text: text.count("!")),
            ("number_of_question", lambda text: text.count("?")),
            ("number_of_ellipsis", lambda text: len(ellipsis_pattern.findall(text))),
            ("number_of_hashtags", lambda text: text.count("#")),
            ("number_of_mentions", lambda text: text.count("@")),
        ]

        for column_name, counter in feature_counters:
            self.add_column(column_name, [counter(text) for text in texts])

In [9]:
# Load the training comments, derive the punctuation-count columns from the
# original text, and preview the first 50 enriched rows.
data = ExtraFeatures()
data.initialize(csv_file="H6751TextandWebMining/train.csv")
data.build_features()
data.processed_data.head(n=50)

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate,number_of_exclamation,number_of_question,number_of_ellipsis,number_of_hashtags,number_of_mentions
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0,0,1,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0,1,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0,0,0,0,1,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0,0,1,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the tools well. · talk """,0,0,0,0,0,0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,0,0,0,0,0
7,00031b1e95af7921,"Your vandalism to the Matt Shirvington article has been reverted. Please don't do it again, or you will be banned.",0,0,0,0,0,0,0,0,0,0,0
8,00037261f536c51d,"Sorry if the word 'nonsense' was offensive to you. Anyway, I'm not intending to write anything in the article(wow they would jump on me for vandalism), I'm merely requesting that it be more encyclopedic so one can use it for school as a reference. I have been to the selective breeding page but it's almost a stub. It points to 'animal breeding' which is a short messy article that gives you no info. There must be someone around with expertise in eugenics? 93.161.107.169",0,0,0,0,0,0,0,1,0,0,0
9,00040093b2687caa,alignment on this subject and which are contrary to those of DuLithgow,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Persist the enriched frame inside the cloned repository; the row index is not written.
data.processed_data.to_csv(
    path_or_buf='H6751TextandWebMining/Feature_Extract.csv',
    index=False,
)

In [11]:
%cd H6751TextandWebMining
!git init
!git pull origin master
!git add Feature_Extract.csv
!git config --global user.email “zloh012@e.ntu.edu.sg”
!git config --global user.name “robinloh”
!git remote remove origin
!git remote add origin https://robinloh:SJB8030%2Fg9@github.com/robinloh/H6751TextandWebMining.git
!git commit -m "Feature Extraction"
!git push -u origin master

/Users/robin/H6751TextandWebMining/H6751TextandWebMining
Reinitialized existing Git repository in /Users/robin/H6751TextandWebMining/H6751TextandWebMining/.git/
From https://github.com/robinloh/H6751TextandWebMining
 * branch            master     -> FETCH_HEAD
Already up to date.
On branch master
nothing to commit, working tree clean
Branch 'master' set up to track remote branch 'master' from 'origin'.
Everything up-to-date
